blob: 88bbf6a941e0485372b403b5139cad1f87ca3d6a [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000070/* enables copy/deepcopy handling (work in progress) */
71#undef USE_BUILTIN_COPY
72
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000073#if PY_VERSION_HEX < 0x01060000
74#define PyObject_DEL(op) PyMem_DEL((op))
75#endif
76
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000077/* -------------------------------------------------------------------- */
78
Fredrik Lundh80946112000-06-29 18:03:25 +000079#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000080#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000081#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000082/* fastest possible local call under MSVC */
83#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000085#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000086#else
87#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000088#endif
89
90/* error codes */
91#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000092#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000093#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000094#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000095#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000096
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000097#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +000098#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000099#else
100#define TRACE(v)
101#endif
102
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000103/* -------------------------------------------------------------------- */
104/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000105
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000106/* default character predicates (run sre_chars.py to regenerate tables) */
107
108#define SRE_DIGIT_MASK 1
109#define SRE_SPACE_MASK 2
110#define SRE_LINEBREAK_MASK 4
111#define SRE_ALNUM_MASK 8
112#define SRE_WORD_MASK 16
113
Fredrik Lundh21009b92001-09-18 18:47:09 +0000114/* FIXME: this assumes ASCII. create tables in init_sre() instead */
115
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000116static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1180, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
11925, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12024, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1210, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12224, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
123
Fredrik Lundhb389df32000-06-29 12:48:37 +0000124static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d582000-06-29 08:58:44 +000012510, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
12627, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
12744, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
12861, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
129108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
130122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
131106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
132120, 121, 122, 123, 124, 125, 126, 127 };
133
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000134#define SRE_IS_DIGIT(ch)\
135 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
136#define SRE_IS_SPACE(ch)\
137 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
138#define SRE_IS_LINEBREAK(ch)\
139 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
140#define SRE_IS_ALNUM(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
142#define SRE_IS_WORD(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000144
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000145static unsigned int sre_lower(unsigned int ch)
146{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000147 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148}
149
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000150/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000151/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
152 * warnings when c's type supports only numbers < N+1 */
153#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
154#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000155#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000156#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000157#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
158
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000159static unsigned int sre_lower_locale(unsigned int ch)
160{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000161 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000162}
163
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000164/* unicode-specific character predicates */
165
166#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000167
Mark Dickinson1f268282009-07-28 17:22:36 +0000168#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000169#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
170#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000171#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000172#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000173
174static unsigned int sre_lower_unicode(unsigned int ch)
175{
176 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
177}
178
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000179#endif
180
Guido van Rossumb700df92000-03-31 14:59:30 +0000181LOCAL(int)
182sre_category(SRE_CODE category, unsigned int ch)
183{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000184 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000185
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 case SRE_CATEGORY_DIGIT:
187 return SRE_IS_DIGIT(ch);
188 case SRE_CATEGORY_NOT_DIGIT:
189 return !SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_SPACE:
191 return SRE_IS_SPACE(ch);
192 case SRE_CATEGORY_NOT_SPACE:
193 return !SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_WORD:
195 return SRE_IS_WORD(ch);
196 case SRE_CATEGORY_NOT_WORD:
197 return !SRE_IS_WORD(ch);
198 case SRE_CATEGORY_LINEBREAK:
199 return SRE_IS_LINEBREAK(ch);
200 case SRE_CATEGORY_NOT_LINEBREAK:
201 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000202
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000203 case SRE_CATEGORY_LOC_WORD:
204 return SRE_LOC_IS_WORD(ch);
205 case SRE_CATEGORY_LOC_NOT_WORD:
206 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000207
208#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000209 case SRE_CATEGORY_UNI_DIGIT:
210 return SRE_UNI_IS_DIGIT(ch);
211 case SRE_CATEGORY_UNI_NOT_DIGIT:
212 return !SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_SPACE:
214 return SRE_UNI_IS_SPACE(ch);
215 case SRE_CATEGORY_UNI_NOT_SPACE:
216 return !SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_WORD:
218 return SRE_UNI_IS_WORD(ch);
219 case SRE_CATEGORY_UNI_NOT_WORD:
220 return !SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_LINEBREAK:
222 return SRE_UNI_IS_LINEBREAK(ch);
223 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
224 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000225#else
226 case SRE_CATEGORY_UNI_DIGIT:
227 return SRE_IS_DIGIT(ch);
228 case SRE_CATEGORY_UNI_NOT_DIGIT:
229 return !SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_SPACE:
231 return SRE_IS_SPACE(ch);
232 case SRE_CATEGORY_UNI_NOT_SPACE:
233 return !SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_WORD:
235 return SRE_LOC_IS_WORD(ch);
236 case SRE_CATEGORY_UNI_NOT_WORD:
237 return !SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_LINEBREAK:
239 return SRE_IS_LINEBREAK(ch);
240 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
241 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000242#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000243 }
244 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000245}
246
247/* helpers */
248
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000249static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000250data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000253 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000255 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000257}
258
259static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000260data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000261{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000262 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 minsize = state->data_stack_base+size;
264 cursize = state->data_stack_size;
265 if (cursize < minsize) {
266 void* stack;
267 cursize = minsize+minsize/4+1024;
268 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000270 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000271 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 return SRE_ERROR_MEMORY;
273 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000274 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000276 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000277 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000278}
279
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000280/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000281
282#define SRE_CHAR unsigned char
283#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000284#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000285#define SRE_CHARSET sre_charset
286#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000287#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000288#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000289#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000290#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000291
292#if defined(HAVE_UNICODE)
293
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000295#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000297
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000298#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#undef SRE_SEARCH
300#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000301#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000302#undef SRE_INFO
303#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000304#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000305#undef SRE_AT
306#undef SRE_CHAR
307
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000308/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000309
310#define SRE_CHAR Py_UNICODE
311#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000312#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000313#define SRE_CHARSET sre_ucharset
314#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000315#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000316#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000317#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000318#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000319#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000320
321#endif /* SRE_RECURSIVE */
322
323/* -------------------------------------------------------------------- */
324/* String matching engine */
325
326/* the following section is compiled twice, with different character
327 settings */
328
329LOCAL(int)
330SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
331{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000332 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000333
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000334 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000339 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 case SRE_AT_BEGINNING_LINE:
343 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000344 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000345
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000346 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000347 return (((void*) (ptr+1) == state->end &&
348 SRE_IS_LINEBREAK((int) ptr[0])) ||
349 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000350
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000351 case SRE_AT_END_LINE:
352 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000353 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000354
Fredrik Lundh770617b2001-01-14 15:06:11 +0000355 case SRE_AT_END_STRING:
356 return ((void*) ptr == state->end);
357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000358 case SRE_AT_BOUNDARY:
359 if (state->beginning == state->end)
360 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000362 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000364 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000366
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000367 case SRE_AT_NON_BOUNDARY:
368 if (state->beginning == state->end)
369 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000373 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000375
376 case SRE_AT_LOC_BOUNDARY:
377 if (state->beginning == state->end)
378 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000380 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000384
385 case SRE_AT_LOC_NON_BOUNDARY:
386 if (state->beginning == state->end)
387 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000389 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000390 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000392 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000393
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000394#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000395 case SRE_AT_UNI_BOUNDARY:
396 if (state->beginning == state->end)
397 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000398 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000399 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000400 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000401 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000403
404 case SRE_AT_UNI_NON_BOUNDARY:
405 if (state->beginning == state->end)
406 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000407 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000408 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000409 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000411 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000412#endif
413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000414 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000417}
418
419LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000420SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000421{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000423
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 for (;;) {
427 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000428
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000429 case SRE_OP_FAILURE:
430 return !ok;
431
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000432 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000433 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 if (ch == set[0])
435 return ok;
436 set++;
437 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000438
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000439 case SRE_OP_CATEGORY:
440 /* <CATEGORY> <code> */
441 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000442 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000443 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Fredrik Lundh3562f112000-07-02 12:00:07 +0000446 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000447 if (sizeof(SRE_CODE) == 2) {
448 /* <CHARSET> <bitmap> (16 bits per code word) */
449 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
450 return ok;
451 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000452 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000453 else {
454 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800455 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000456 return ok;
457 set += 8;
458 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000459 break;
460
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000461 case SRE_OP_RANGE:
462 /* <RANGE> <lower> <upper> */
463 if (set[0] <= ch && ch <= set[1])
464 return ok;
465 set += 2;
466 break;
467
468 case SRE_OP_NEGATE:
469 ok = !ok;
470 break;
471
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000472 case SRE_OP_BIGCHARSET:
473 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
474 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000475 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000476 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000477
478 if (sizeof(SRE_CODE) == 2) {
479 block = ((unsigned char*)set)[ch >> 8];
480 set += 128;
481 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
482 return ok;
483 set += count*16;
484 }
485 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000486 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
487 * warnings when c's type supports only numbers < N+1 */
488 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000489 block = ((unsigned char*)set)[ch >> 8];
490 else
491 block = -1;
492 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000493 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800494 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000495 return ok;
496 set += count*8;
497 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000498 break;
499 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000500
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000501 default:
502 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000503 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 return 0;
505 }
506 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000507}
508
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000509LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000510
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000511LOCAL(Py_ssize_t)
512SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513{
514 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000515 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
516 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000517 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000518
519 /* adjust end */
520 if (maxcount < end - ptr && maxcount != 65535)
521 end = ptr + maxcount;
522
523 switch (pattern[0]) {
524
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000525 case SRE_OP_IN:
526 /* repeated set */
527 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
528 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
529 ptr++;
530 break;
531
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000532 case SRE_OP_ANY:
533 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000534 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000535 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
536 ptr++;
537 break;
538
539 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000540 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000541 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000542 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000543 ptr = end;
544 break;
545
546 case SRE_OP_LITERAL:
547 /* repeated literal */
548 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 while (ptr < end && (SRE_CODE) *ptr == chr)
551 ptr++;
552 break;
553
554 case SRE_OP_LITERAL_IGNORE:
555 /* repeated literal */
556 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000557 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000558 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
559 ptr++;
560 break;
561
562 case SRE_OP_NOT_LITERAL:
563 /* repeated non-literal */
564 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000565 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000566 while (ptr < end && (SRE_CODE) *ptr != chr)
567 ptr++;
568 break;
Tim Peters3d563502006-01-21 02:47:53 +0000569
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000570 case SRE_OP_NOT_LITERAL_IGNORE:
571 /* repeated non-literal */
572 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000573 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000574 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
575 ptr++;
576 break;
577
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000578 default:
579 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000580 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000582 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 if (i < 0)
584 return i;
585 if (!i)
586 break;
587 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000588 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
589 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 return (SRE_CHAR*) state->ptr - ptr;
591 }
592
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000593 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000594 return ptr - (SRE_CHAR*) state->ptr;
595}
596
Fredrik Lundh33accc12000-08-27 20:59:47 +0000597#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000598LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000599SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
600{
601 /* check if an SRE_OP_INFO block matches at the current position.
602 returns the number of SRE_CODE objects to skip if successful, 0
603 if no match */
604
605 SRE_CHAR* end = state->end;
606 SRE_CHAR* ptr = state->ptr;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000607 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000608
609 /* check minimal length */
610 if (pattern[3] && (end - ptr) < pattern[3])
611 return 0;
612
613 /* check known prefix */
614 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
615 /* <length> <skip> <prefix data> <overlap data> */
616 for (i = 0; i < pattern[5]; i++)
617 if ((SRE_CODE) ptr[i] != pattern[7 + i])
618 return 0;
619 return pattern[0] + 2 * pattern[6];
620 }
621 return pattern[0];
622}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000623#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000624
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000625/* The macros below should be used to protect recursive SRE_MATCH()
626 * calls that *failed* and do *not* return immediately (IOW, those
627 * that will backtrack). Explaining:
628 *
629 * - Recursive SRE_MATCH() returned true: that's usually a success
630 * (besides atypical cases like ASSERT_NOT), therefore there's no
631 * reason to restore lastmark;
632 *
633 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
634 * is returning to the caller: If the current SRE_MATCH() is the
635 * top function of the recursion, returning false will be a matching
636 * failure, and it doesn't matter where lastmark is pointing to.
637 * If it's *not* the top function, it will be a recursive SRE_MATCH()
638 * failure by itself, and the calling SRE_MATCH() will have to deal
639 * with the failure by the same rules explained here (it will restore
640 * lastmark by itself if necessary);
641 *
642 * - Recursive SRE_MATCH() returned false, and will continue the
643 * outside 'for' loop: must be protected when breaking, since the next
644 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000645 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000646 * - Recursive SRE_MATCH() returned false, and will be called again
647 * inside a local for/while loop: must be protected between each
648 * loop iteration, since the recursive SRE_MATCH() could do anything,
649 * and could potentially depend on lastmark.
650 *
651 * For more information, check the discussion at SF patch #712900.
652 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000653#define LASTMARK_SAVE() \
654 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000655 ctx->lastmark = state->lastmark; \
656 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000657 } while (0)
658#define LASTMARK_RESTORE() \
659 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000660 state->lastmark = ctx->lastmark; \
661 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000662 } while (0)
663
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000664#define RETURN_ERROR(i) do { return i; } while(0)
665#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
666#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
667
668#define RETURN_ON_ERROR(i) \
669 do { if (i < 0) RETURN_ERROR(i); } while (0)
670#define RETURN_ON_SUCCESS(i) \
671 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
672#define RETURN_ON_FAILURE(i) \
673 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
674
675#define SFY(x) #x
676
677#define DATA_STACK_ALLOC(state, type, ptr) \
678do { \
679 alloc_pos = state->data_stack_base; \
680 TRACE(("allocating %s in %d (%d)\n", \
681 SFY(type), alloc_pos, sizeof(type))); \
682 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
683 int j = data_stack_grow(state, sizeof(type)); \
684 if (j < 0) return j; \
685 if (ctx_pos != -1) \
686 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
687 } \
688 ptr = (type*)(state->data_stack+alloc_pos); \
689 state->data_stack_base += sizeof(type); \
690} while (0)
691
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000692#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
693do { \
694 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
695 ptr = (type*)(state->data_stack+pos); \
696} while (0)
697
698#define DATA_STACK_PUSH(state, data, size) \
699do { \
700 TRACE(("copy data in %p to %d (%d)\n", \
701 data, state->data_stack_base, size)); \
702 if (state->data_stack_size < state->data_stack_base+size) { \
703 int j = data_stack_grow(state, size); \
704 if (j < 0) return j; \
705 if (ctx_pos != -1) \
706 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
707 } \
708 memcpy(state->data_stack+state->data_stack_base, data, size); \
709 state->data_stack_base += size; \
710} while (0)
711
712#define DATA_STACK_POP(state, data, size, discard) \
713do { \
714 TRACE(("copy data to %p from %d (%d)\n", \
715 data, state->data_stack_base-size, size)); \
716 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
717 if (discard) \
718 state->data_stack_base -= size; \
719} while (0)
720
721#define DATA_STACK_POP_DISCARD(state, size) \
722do { \
723 TRACE(("discard data from %d (%d)\n", \
724 state->data_stack_base-size, size)); \
725 state->data_stack_base -= size; \
726} while(0)
727
728#define DATA_PUSH(x) \
729 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
730#define DATA_POP(x) \
731 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000732#define DATA_POP_DISCARD(x) \
733 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
734#define DATA_ALLOC(t,p) \
735 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000736#define DATA_LOOKUP_AT(t,p,pos) \
737 DATA_STACK_LOOKUP_AT(state,t,p,pos)
738
739#define MARK_PUSH(lastmark) \
740 do if (lastmark > 0) { \
741 i = lastmark; /* ctx->lastmark may change if reallocated */ \
742 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
743 } while (0)
744#define MARK_POP(lastmark) \
745 do if (lastmark > 0) { \
746 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
747 } while (0)
748#define MARK_POP_KEEP(lastmark) \
749 do if (lastmark > 0) { \
750 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
751 } while (0)
752#define MARK_POP_DISCARD(lastmark) \
753 do if (lastmark > 0) { \
754 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
755 } while (0)
756
757#define JUMP_NONE 0
758#define JUMP_MAX_UNTIL_1 1
759#define JUMP_MAX_UNTIL_2 2
760#define JUMP_MAX_UNTIL_3 3
761#define JUMP_MIN_UNTIL_1 4
762#define JUMP_MIN_UNTIL_2 5
763#define JUMP_MIN_UNTIL_3 6
764#define JUMP_REPEAT 7
765#define JUMP_REPEAT_ONE_1 8
766#define JUMP_REPEAT_ONE_2 9
767#define JUMP_MIN_REPEAT_ONE 10
768#define JUMP_BRANCH 11
769#define JUMP_ASSERT 12
770#define JUMP_ASSERT_NOT 13
771
772#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
773 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
774 nextctx->last_ctx_pos = ctx_pos; \
775 nextctx->jump = jumpvalue; \
776 nextctx->pattern = nextpattern; \
777 ctx_pos = alloc_pos; \
778 ctx = nextctx; \
779 goto entrance; \
780 jumplabel: \
781 while (0) /* gcc doesn't like labels at end of scopes */ \
782
783typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000784 Py_ssize_t last_ctx_pos;
785 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000786 SRE_CHAR* ptr;
787 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000788 Py_ssize_t count;
789 Py_ssize_t lastmark;
790 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000791 union {
792 SRE_CODE chr;
793 SRE_REPEAT* rep;
794 } u;
795} SRE_MATCH_CONTEXT;
796
797/* check if string matches the given pattern. returns <0 for
798 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000799LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000800SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000801{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000802 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000803 Py_ssize_t alloc_pos, ctx_pos = -1;
804 Py_ssize_t i, ret = 0;
805 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000806 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000807
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000808 SRE_MATCH_CONTEXT* ctx;
809 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000810
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000811 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000813 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
814 ctx->last_ctx_pos = -1;
815 ctx->jump = JUMP_NONE;
816 ctx->pattern = pattern;
817 ctx_pos = alloc_pos;
818
819entrance:
820
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000821 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000822
823 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000824 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000825 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000826 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000828 (end - ctx->ptr), ctx->pattern[3]));
829 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000832 }
833
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000834 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000835 ++sigcount;
836 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
837 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000840
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000841 case SRE_OP_MARK:
842 /* set mark */
843 /* <MARK> <gid> */
844 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
845 ctx->ptr, ctx->pattern[0]));
846 i = ctx->pattern[0];
847 if (i & 1)
848 state->lastindex = i/2 + 1;
849 if (i > state->lastmark) {
850 /* state->lastmark is the highest valid index in the
851 state->mark array. If it is increased by more than 1,
852 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000853 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000854 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000855 while (j < i)
856 state->mark[j++] = NULL;
857 state->lastmark = i;
858 }
859 state->mark[i] = ctx->ptr;
860 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000862
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 case SRE_OP_LITERAL:
864 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000865 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000866 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
867 ctx->ptr, *ctx->pattern));
868 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
869 RETURN_FAILURE;
870 ctx->pattern++;
871 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000873
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 case SRE_OP_NOT_LITERAL:
875 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000876 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000877 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
884
885 case SRE_OP_SUCCESS:
886 /* end of pattern */
887 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
888 state->ptr = ctx->ptr;
889 RETURN_SUCCESS;
890
891 case SRE_OP_AT:
892 /* match at given position */
893 /* <AT> <code> */
894 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
895 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
896 RETURN_FAILURE;
897 ctx->pattern++;
898 break;
899
900 case SRE_OP_CATEGORY:
901 /* match at given category */
902 /* <CATEGORY> <code> */
903 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
904 ctx->ptr, *ctx->pattern));
905 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
906 RETURN_FAILURE;
907 ctx->pattern++;
908 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000912 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000913 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
915 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
916 RETURN_FAILURE;
917 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000918 break;
919
920 case SRE_OP_ANY_ALL:
921 /* match anything */
922 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
924 if (ctx->ptr >= end)
925 RETURN_FAILURE;
926 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000929 case SRE_OP_IN:
930 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000931 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000932 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
933 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
934 RETURN_FAILURE;
935 ctx->pattern += ctx->pattern[0];
936 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000940 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
941 ctx->pattern, ctx->ptr, ctx->pattern[0]));
942 if (ctx->ptr >= end ||
943 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
944 RETURN_FAILURE;
945 ctx->pattern++;
946 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000950 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
951 ctx->pattern, ctx->ptr, *ctx->pattern));
952 if (ctx->ptr >= end ||
953 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
954 RETURN_FAILURE;
955 ctx->pattern++;
956 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000958
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000960 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
961 if (ctx->ptr >= end
962 || !SRE_CHARSET(ctx->pattern+1,
963 (SRE_CODE)state->lower(*ctx->ptr)))
964 RETURN_FAILURE;
965 ctx->pattern += ctx->pattern[0];
966 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000968
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000969 case SRE_OP_JUMP:
970 case SRE_OP_INFO:
971 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000972 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
974 ctx->ptr, ctx->pattern[0]));
975 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 break;
977
978 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* alternation */
980 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000982 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 ctx->u.rep = state->repeat;
984 if (ctx->u.rep)
985 MARK_PUSH(ctx->lastmark);
986 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
987 if (ctx->pattern[1] == SRE_OP_LITERAL &&
988 (ctx->ptr >= end ||
989 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000990 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000991 if (ctx->pattern[1] == SRE_OP_IN &&
992 (ctx->ptr >= end ||
993 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000994 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000996 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 if (ret) {
998 if (ctx->u.rep)
999 MARK_POP_DISCARD(ctx->lastmark);
1000 RETURN_ON_ERROR(ret);
1001 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001002 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 if (ctx->u.rep)
1004 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001005 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001006 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001007 if (ctx->u.rep)
1008 MARK_POP_DISCARD(ctx->lastmark);
1009 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001010
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001011 case SRE_OP_REPEAT_ONE:
1012 /* match repeated sequence (maximizing regexp) */
1013
1014 /* this operator only works if the repeated item is
1015 exactly one character wide, and we're not already
1016 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001017 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
1019 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1020
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001021 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1022 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 if (ctx->ptr + ctx->pattern[1] > end)
1025 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001029 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1030 RETURN_ON_ERROR(ret);
1031 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1032 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034
1035 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 string. check if the rest of the pattern matches,
1038 and backtrack if not. */
1039
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001040 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001045 state->ptr = ctx->ptr;
1046 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001048
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001049 LASTMARK_SAVE();
1050
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 /* tail starts with a literal. skip positions where
1053 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001056 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1058 ctx->ptr--;
1059 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001061 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1065 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 if (ret) {
1067 RETURN_ON_ERROR(ret);
1068 RETURN_SUCCESS;
1069 }
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001071 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001072
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 ctx->ptr--;
1074 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001075 }
1076
1077 } else {
1078 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001079 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1082 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 if (ret) {
1084 RETURN_ON_ERROR(ret);
1085 RETURN_SUCCESS;
1086 }
1087 ctx->ptr--;
1088 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001089 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001090 }
1091 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093
Guido van Rossum41c99e72003-04-14 17:59:34 +00001094 case SRE_OP_MIN_REPEAT_ONE:
1095 /* match repeated sequence (minimizing regexp) */
1096
1097 /* this operator only works if the repeated item is
1098 exactly one character wide, and we're not already
1099 collecting backtracking points. for other cases,
1100 use the MIN_REPEAT operator */
1101
1102 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1105 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 if (ctx->ptr + ctx->pattern[1] > end)
1108 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001112 if (ctx->pattern[1] == 0)
1113 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114 else {
1115 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001116 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1117 RETURN_ON_ERROR(ret);
1118 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001119 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001120 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 RETURN_FAILURE;
1122 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001125 }
1126
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131
1132 } else {
1133 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001134 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001135 while ((Py_ssize_t)ctx->pattern[2] == 65535
1136 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001137 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1143 }
1144 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001147 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001149 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001150 assert(ret == 1);
1151 ctx->ptr++;
1152 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001153 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001154 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001156 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001158 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001160 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001161 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1163 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
1165 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001166 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001167 if (!ctx->u.rep) {
1168 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001169 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001170 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001171 ctx->u.rep->count = -1;
1172 ctx->u.rep->pattern = ctx->pattern;
1173 ctx->u.rep->prev = state->repeat;
1174 ctx->u.rep->last_ptr = NULL;
1175 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001176
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001177 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001179 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001180 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001181
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001182 if (ret) {
1183 RETURN_ON_ERROR(ret);
1184 RETURN_SUCCESS;
1185 }
1186 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001187
1188 case SRE_OP_MAX_UNTIL:
1189 /* maximizing repeat */
1190 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1191
1192 /* FIXME: we probably need to deal with zero-width
1193 matches in here... */
1194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 ctx->u.rep = state->repeat;
1196 if (!ctx->u.rep)
1197 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001201 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001202
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001203 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1204 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1210 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 if (ret) {
1212 RETURN_ON_ERROR(ret);
1213 RETURN_SUCCESS;
1214 }
1215 ctx->u.rep->count = ctx->count-1;
1216 state->ptr = ctx->ptr;
1217 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001218 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001219
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001220 if ((ctx->count < ctx->u.rep->pattern[2] ||
1221 ctx->u.rep->pattern[2] == 65535) &&
1222 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001223 /* we may have enough matches, but if we can
1224 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001226 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 MARK_PUSH(ctx->lastmark);
1228 /* zero-width match protection */
1229 DATA_PUSH(&ctx->u.rep->last_ptr);
1230 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1232 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001233 DATA_POP(&ctx->u.rep->last_ptr);
1234 if (ret) {
1235 MARK_POP_DISCARD(ctx->lastmark);
1236 RETURN_ON_ERROR(ret);
1237 RETURN_SUCCESS;
1238 }
1239 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001240 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001241 ctx->u.rep->count = ctx->count-1;
1242 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001243 }
1244
1245 /* cannot match more repeated items here. make sure the
1246 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001249 RETURN_ON_SUCCESS(ret);
1250 state->repeat = ctx->u.rep;
1251 state->ptr = ctx->ptr;
1252 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001253
1254 case SRE_OP_MIN_UNTIL:
1255 /* minimizing repeat */
1256 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1257
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 ctx->u.rep = state->repeat;
1259 if (!ctx->u.rep)
1260 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001263
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1267 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001270 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1273 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 if (ret) {
1275 RETURN_ON_ERROR(ret);
1276 RETURN_SUCCESS;
1277 }
1278 ctx->u.rep->count = ctx->count-1;
1279 state->ptr = ctx->ptr;
1280 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001281 }
1282
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001283 LASTMARK_SAVE();
1284
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001285 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001286 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001288 if (ret) {
1289 RETURN_ON_ERROR(ret);
1290 RETURN_SUCCESS;
1291 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001292
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 state->repeat = ctx->u.rep;
1294 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001295
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001296 LASTMARK_RESTORE();
1297
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001298 if (ctx->count >= ctx->u.rep->pattern[2]
1299 && ctx->u.rep->pattern[2] != 65535)
1300 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001301
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001302 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001303 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1304 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 if (ret) {
1306 RETURN_ON_ERROR(ret);
1307 RETURN_SUCCESS;
1308 }
1309 ctx->u.rep->count = ctx->count-1;
1310 state->ptr = ctx->ptr;
1311 RETURN_FAILURE;
1312
1313 case SRE_OP_GROUPREF:
1314 /* match backreference */
1315 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1316 ctx->ptr, ctx->pattern[0]));
1317 i = ctx->pattern[0];
1318 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001319 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001320 if (groupref >= state->lastmark) {
1321 RETURN_FAILURE;
1322 } else {
1323 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1324 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1325 if (!p || !e || e < p)
1326 RETURN_FAILURE;
1327 while (p < e) {
1328 if (ctx->ptr >= end || *ctx->ptr != *p)
1329 RETURN_FAILURE;
1330 p++; ctx->ptr++;
1331 }
1332 }
1333 }
1334 ctx->pattern++;
1335 break;
1336
1337 case SRE_OP_GROUPREF_IGNORE:
1338 /* match backreference */
1339 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1340 ctx->ptr, ctx->pattern[0]));
1341 i = ctx->pattern[0];
1342 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001343 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001344 if (groupref >= state->lastmark) {
1345 RETURN_FAILURE;
1346 } else {
1347 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1348 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1349 if (!p || !e || e < p)
1350 RETURN_FAILURE;
1351 while (p < e) {
1352 if (ctx->ptr >= end ||
1353 state->lower(*ctx->ptr) != state->lower(*p))
1354 RETURN_FAILURE;
1355 p++; ctx->ptr++;
1356 }
1357 }
1358 }
1359 ctx->pattern++;
1360 break;
1361
1362 case SRE_OP_GROUPREF_EXISTS:
1363 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1364 ctx->ptr, ctx->pattern[0]));
1365 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1366 i = ctx->pattern[0];
1367 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001368 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001369 if (groupref >= state->lastmark) {
1370 ctx->pattern += ctx->pattern[1];
1371 break;
1372 } else {
1373 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1374 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1375 if (!p || !e || e < p) {
1376 ctx->pattern += ctx->pattern[1];
1377 break;
1378 }
1379 }
1380 }
1381 ctx->pattern += 2;
1382 break;
1383
1384 case SRE_OP_ASSERT:
1385 /* assert subpattern */
1386 /* <ASSERT> <skip> <back> <pattern> */
1387 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1388 ctx->ptr, ctx->pattern[1]));
1389 state->ptr = ctx->ptr - ctx->pattern[1];
1390 if (state->ptr < state->beginning)
1391 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001392 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001393 RETURN_ON_FAILURE(ret);
1394 ctx->pattern += ctx->pattern[0];
1395 break;
1396
1397 case SRE_OP_ASSERT_NOT:
1398 /* assert not subpattern */
1399 /* <ASSERT_NOT> <skip> <back> <pattern> */
1400 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1401 ctx->ptr, ctx->pattern[1]));
1402 state->ptr = ctx->ptr - ctx->pattern[1];
1403 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001404 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001405 if (ret) {
1406 RETURN_ON_ERROR(ret);
1407 RETURN_FAILURE;
1408 }
1409 }
1410 ctx->pattern += ctx->pattern[0];
1411 break;
1412
1413 case SRE_OP_FAILURE:
1414 /* immediate failure */
1415 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1416 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001417
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001418 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001419 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1420 ctx->pattern[-1]));
1421 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001422 }
1423 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001424
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001425exit:
1426 ctx_pos = ctx->last_ctx_pos;
1427 jump = ctx->jump;
1428 DATA_POP_DISCARD(ctx);
1429 if (ctx_pos == -1)
1430 return ret;
1431 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1432
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001433 switch (jump) {
1434 case JUMP_MAX_UNTIL_2:
1435 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1436 goto jump_max_until_2;
1437 case JUMP_MAX_UNTIL_3:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_3;
1440 case JUMP_MIN_UNTIL_2:
1441 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1442 goto jump_min_until_2;
1443 case JUMP_MIN_UNTIL_3:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_3;
1446 case JUMP_BRANCH:
1447 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1448 goto jump_branch;
1449 case JUMP_MAX_UNTIL_1:
1450 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1451 goto jump_max_until_1;
1452 case JUMP_MIN_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_min_until_1;
1455 case JUMP_REPEAT:
1456 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1457 goto jump_repeat;
1458 case JUMP_REPEAT_ONE_1:
1459 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat_one_1;
1461 case JUMP_REPEAT_ONE_2:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_2;
1464 case JUMP_MIN_REPEAT_ONE:
1465 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1466 goto jump_min_repeat_one;
1467 case JUMP_ASSERT:
1468 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1469 goto jump_assert;
1470 case JUMP_ASSERT_NOT:
1471 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert_not;
1473 case JUMP_NONE:
1474 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1475 break;
1476 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001477
1478 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001479}
1480
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001481LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001482SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1483{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001484 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1485 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001486 Py_ssize_t status = 0;
1487 Py_ssize_t prefix_len = 0;
1488 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001489 SRE_CODE* prefix = NULL;
1490 SRE_CODE* charset = NULL;
1491 SRE_CODE* overlap = NULL;
1492 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001493
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001494 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001496 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001497
1498 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001499
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001500 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001501 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001503 end -= pattern[3]-1;
1504 if (end <= ptr)
1505 end = ptr+1;
1506 }
1507
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001509 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001510 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001511 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001512 prefix_skip = pattern[6];
1513 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 overlap = prefix + prefix_len - 1;
1515 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001516 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001517 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001518 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001519
1520 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001521 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001522
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001523 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1524 TRACE(("charset = %p\n", charset));
1525
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001527 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001528 /* pattern starts with a known prefix. use the overlap
1529 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001530 Py_ssize_t i = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001531 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001532 while (ptr < end) {
1533 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001534 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 if (!i)
1536 break;
1537 else
1538 i = overlap[i];
1539 } else {
1540 if (++i == prefix_len) {
1541 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001542 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1543 state->start = ptr + 1 - prefix_len;
1544 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001545 if (flags & SRE_INFO_LITERAL)
1546 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001547 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001548 if (status != 0)
1549 return status;
1550 /* close but no cigar -- try again */
1551 i = overlap[i];
1552 }
1553 break;
1554 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001555 }
1556 ptr++;
1557 }
1558 return 0;
1559 }
1560#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001561
Fredrik Lundh3562f112000-07-02 12:00:07 +00001562 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001564 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001566 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 for (;;) {
1568 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1569 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001570 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001571 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001572 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 state->start = ptr;
1574 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001575 if (flags & SRE_INFO_LITERAL)
1576 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001577 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 if (status != 0)
1579 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001580 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 } else if (charset) {
1582 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001583 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001587 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001589 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001590 state->start = ptr;
1591 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001592 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 if (status != 0)
1594 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001595 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 }
1597 } else
1598 /* general case */
1599 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001600 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001602 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 if (status != 0)
1604 break;
1605 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001606
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001607 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001608}
Tim Peters3d563502006-01-21 02:47:53 +00001609
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001610LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001611SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001612{
1613 /* check if given string is a literal template (i.e. no escapes) */
1614 while (len-- > 0)
1615 if (*ptr++ == '\\')
1616 return 0;
1617 return 1;
1618}
Guido van Rossumb700df92000-03-31 14:59:30 +00001619
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001620#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001621
1622/* -------------------------------------------------------------------- */
1623/* factories and destructors */
1624
1625/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001626static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1627static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001628
1629static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001630sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001631{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001632 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001633}
1634
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001635static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001636sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001637{
1638 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001639 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640 return NULL;
1641 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001642 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001643 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001644#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001646#else
1647 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001648#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001649 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001650}
1651
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001652LOCAL(void)
1653state_reset(SRE_STATE* state)
1654{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001656 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001657
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001658 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001659 state->lastindex = -1;
1660
1661 state->repeat = NULL;
1662
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001663 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664}
1665
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001666static void*
Benjamin Petersone48944b2012-03-07 14:50:25 -06001667getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize, Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001668{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001669 /* given a python object, return a data pointer, a length (in
1670 characters), and a character size. return NULL if the object
1671 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001672
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001674 Py_ssize_t size, bytes;
1675 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001677
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001678 /* Unicode objects do not support the buffer API. So, get the data
1679 directly instead. */
1680 if (PyUnicode_Check(string)) {
1681 ptr = (void *)PyUnicode_AS_DATA(string);
1682 *p_length = PyUnicode_GET_SIZE(string);
1683 *p_charsize = sizeof(Py_UNICODE);
1684 return ptr;
1685 }
1686
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001687 /* get pointer to string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001688 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001689 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001690 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001691 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001692 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1693 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001694 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001696 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001697 bytes = view->len;
1698 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001699
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001700 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001702 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001703 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001704
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001705 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001706 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001707
Christian Heimes72b710a2008-05-26 13:28:38 +00001708 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001709 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001710#if defined(HAVE_UNICODE)
Antoine Pitroufd036452008-08-19 17:56:33 +00001711 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001712 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001713#endif
1714 else {
1715 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001716 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001717 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001718
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001719 *p_length = size;
1720 *p_charsize = charsize;
1721
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001722 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001723 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001724 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001725 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001726 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001727 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001728 err:
1729 PyBuffer_Release(view);
1730 view->buf = NULL;
1731 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001732}
1733
1734LOCAL(PyObject*)
1735state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001736 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001737{
1738 /* prepare state object */
1739
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001740 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001741 int charsize;
1742 void* ptr;
1743
1744 memset(state, 0, sizeof(SRE_STATE));
1745
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001746 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001747 state->lastindex = -1;
1748
Benjamin Petersone48944b2012-03-07 14:50:25 -06001749 state->buffer.buf = NULL;
1750 ptr = getstring(string, &length, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001752 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001753
Benjamin Petersone48944b2012-03-07 14:50:25 -06001754 if (charsize == 1 && pattern->charsize > 1) {
1755 PyErr_SetString(PyExc_TypeError,
Antoine Pitroufd036452008-08-19 17:56:33 +00001756 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001757 goto err;
1758 }
1759 if (charsize > 1 && pattern->charsize == 1) {
1760 PyErr_SetString(PyExc_TypeError,
Antoine Pitroufd036452008-08-19 17:56:33 +00001761 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001762 goto err;
1763 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001764
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001765 /* adjust boundaries */
1766 if (start < 0)
1767 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001768 else if (start > length)
1769 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001771 if (end < 0)
1772 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001773 else if (end > length)
1774 end = length;
1775
1776 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001777
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001778 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001779
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001780 state->start = (void*) ((char*) ptr + start * state->charsize);
1781 state->end = (void*) ((char*) ptr + end * state->charsize);
1782
1783 Py_INCREF(string);
1784 state->string = string;
1785 state->pos = start;
1786 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001787
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001788 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001789 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001790 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001791#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001793#else
1794 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001795#endif
1796 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001797 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001798
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001799 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001800 err:
1801 if (state->buffer.buf)
1802 PyBuffer_Release(&state->buffer);
1803 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001804}
1805
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806LOCAL(void)
1807state_fini(SRE_STATE* state)
1808{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001809 if (state->buffer.buf)
1810 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001811 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001812 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001813}
1814
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001815/* calculate offset from start of string */
1816#define STATE_OFFSET(state, member)\
1817 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1818
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001819LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001820state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001821{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001822 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001823
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001824 index = (index - 1) * 2;
1825
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001826 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001827 if (empty)
1828 /* want empty string */
1829 i = j = 0;
1830 else {
1831 Py_INCREF(Py_None);
1832 return Py_None;
1833 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001834 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001835 i = STATE_OFFSET(state, state->mark[index]);
1836 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001837 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001838
Fredrik Lundh58100642000-08-09 09:14:35 +00001839 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001840}
1841
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001842static void
1843pattern_error(int status)
1844{
1845 switch (status) {
1846 case SRE_ERROR_RECURSION_LIMIT:
1847 PyErr_SetString(
1848 PyExc_RuntimeError,
1849 "maximum recursion limit exceeded"
1850 );
1851 break;
1852 case SRE_ERROR_MEMORY:
1853 PyErr_NoMemory();
1854 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001855 case SRE_ERROR_INTERRUPTED:
1856 /* An exception has already been raised, so let it fly */
1857 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001858 default:
1859 /* other error codes indicate compiler/engine bugs */
1860 PyErr_SetString(
1861 PyExc_RuntimeError,
1862 "internal error in regular expression engine"
1863 );
1864 }
1865}
1866
Guido van Rossumb700df92000-03-31 14:59:30 +00001867static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001868pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001869{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001870 if (self->weakreflist != NULL)
1871 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001872 if (self->view.buf)
1873 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001874 Py_XDECREF(self->pattern);
1875 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001876 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001877 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001878}
1879
1880static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001881pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001882{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 SRE_STATE state;
1884 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001885
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001886 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001887 Py_ssize_t start = 0;
1888 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001889 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001890 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001891 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001893
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001894 string = state_init(&state, self, string, start, end);
1895 if (!string)
1896 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001897
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 state.ptr = state.start;
1899
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001900 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1901
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001902 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001903 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001905#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001906 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001907#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001909
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001910 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001911 if (PyErr_Occurred())
1912 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001914 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001917}
1918
1919static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001920pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001921{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 SRE_STATE state;
1923 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001925 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001926 Py_ssize_t start = 0;
1927 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001928 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001929 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001930 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001931 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001932
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 string = state_init(&state, self, string, start, end);
1934 if (!string)
1935 return NULL;
1936
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001937 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001939 if (state.charsize == 1) {
1940 status = sre_search(&state, PatternObject_GetCode(self));
1941 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001942#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001943 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001944#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001945 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001946
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001947 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001949 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001950
Thomas Wouters89f507f2006-12-13 04:49:30 +00001951 if (PyErr_Occurred())
1952 return NULL;
1953
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001954 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001955}
1956
1957static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001958call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959{
1960 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001961 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001962 PyObject* func;
1963 PyObject* result;
1964
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001965 if (!args)
1966 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001967 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 if (!name)
1969 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001970 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001972 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001973 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001974 func = PyObject_GetAttrString(mod, function);
1975 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001976 if (!func)
1977 return NULL;
1978 result = PyObject_CallObject(func, args);
1979 Py_DECREF(func);
1980 Py_DECREF(args);
1981 return result;
1982}
1983
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001984#ifdef USE_BUILTIN_COPY
1985static int
1986deepcopy(PyObject** object, PyObject* memo)
1987{
1988 PyObject* copy;
1989
1990 copy = call(
1991 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001992 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001993 );
1994 if (!copy)
1995 return 0;
1996
1997 Py_DECREF(*object);
1998 *object = copy;
1999
2000 return 1; /* success */
2001}
2002#endif
2003
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002004static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002005join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002006{
2007 /* join list elements */
2008
2009 PyObject* joiner;
2010#if PY_VERSION_HEX >= 0x01060000
2011 PyObject* function;
2012 PyObject* args;
2013#endif
2014 PyObject* result;
2015
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002016 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002017 if (!joiner)
2018 return NULL;
2019
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002020 if (PyList_GET_SIZE(list) == 0) {
2021 Py_DECREF(list);
2022 return joiner;
2023 }
2024
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002025#if PY_VERSION_HEX >= 0x01060000
2026 function = PyObject_GetAttrString(joiner, "join");
2027 if (!function) {
2028 Py_DECREF(joiner);
2029 return NULL;
2030 }
2031 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002032 if (!args) {
2033 Py_DECREF(function);
2034 Py_DECREF(joiner);
2035 return NULL;
2036 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002037 PyTuple_SET_ITEM(args, 0, list);
2038 result = PyObject_CallObject(function, args);
2039 Py_DECREF(args); /* also removes list */
2040 Py_DECREF(function);
2041#else
2042 result = call(
2043 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002044 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002045 );
2046#endif
2047 Py_DECREF(joiner);
2048
2049 return result;
2050}
2051
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002052static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002053pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002054{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 SRE_STATE state;
2056 PyObject* list;
2057 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002058 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002061 Py_ssize_t start = 0;
2062 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002063 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002064 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002065 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002066 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 string = state_init(&state, self, string, start, end);
2069 if (!string)
2070 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002071
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002073 if (!list) {
2074 state_fini(&state);
2075 return NULL;
2076 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002077
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002078 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002079
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002080 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002081
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002082 state_reset(&state);
2083
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002084 state.ptr = state.start;
2085
2086 if (state.charsize == 1) {
2087 status = sre_search(&state, PatternObject_GetCode(self));
2088 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002089#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002090 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002091#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002092 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002093
Thomas Wouters89f507f2006-12-13 04:49:30 +00002094 if (PyErr_Occurred())
2095 goto error;
2096
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002097 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002098 if (status == 0)
2099 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002100 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002101 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002102 }
Tim Peters3d563502006-01-21 02:47:53 +00002103
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002104 /* don't bother to build a match object */
2105 switch (self->groups) {
2106 case 0:
2107 b = STATE_OFFSET(&state, state.start);
2108 e = STATE_OFFSET(&state, state.ptr);
2109 item = PySequence_GetSlice(string, b, e);
2110 if (!item)
2111 goto error;
2112 break;
2113 case 1:
2114 item = state_getslice(&state, 1, string, 1);
2115 if (!item)
2116 goto error;
2117 break;
2118 default:
2119 item = PyTuple_New(self->groups);
2120 if (!item)
2121 goto error;
2122 for (i = 0; i < self->groups; i++) {
2123 PyObject* o = state_getslice(&state, i+1, string, 1);
2124 if (!o) {
2125 Py_DECREF(item);
2126 goto error;
2127 }
2128 PyTuple_SET_ITEM(item, i, o);
2129 }
2130 break;
2131 }
2132
2133 status = PyList_Append(list, item);
2134 Py_DECREF(item);
2135 if (status < 0)
2136 goto error;
2137
2138 if (state.ptr == state.start)
2139 state.start = (void*) ((char*) state.ptr + state.charsize);
2140 else
2141 state.start = state.ptr;
2142
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002143 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002144
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002145 state_fini(&state);
2146 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002147
2148error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002149 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002150 state_fini(&state);
2151 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002152
Guido van Rossumb700df92000-03-31 14:59:30 +00002153}
2154
Fredrik Lundh703ce812001-10-24 22:16:30 +00002155#if PY_VERSION_HEX >= 0x02020000
2156static PyObject*
2157pattern_finditer(PatternObject* pattern, PyObject* args)
2158{
2159 PyObject* scanner;
2160 PyObject* search;
2161 PyObject* iterator;
2162
2163 scanner = pattern_scanner(pattern, args);
2164 if (!scanner)
2165 return NULL;
2166
2167 search = PyObject_GetAttrString(scanner, "search");
2168 Py_DECREF(scanner);
2169 if (!search)
2170 return NULL;
2171
2172 iterator = PyCallIter_New(search, Py_None);
2173 Py_DECREF(search);
2174
2175 return iterator;
2176}
2177#endif
2178
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002179static PyObject*
2180pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2181{
2182 SRE_STATE state;
2183 PyObject* list;
2184 PyObject* item;
2185 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002186 Py_ssize_t n;
2187 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002188 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002189
2190 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002191 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002192 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002193 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002194 &string, &maxsplit))
2195 return NULL;
2196
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002197 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002198 if (!string)
2199 return NULL;
2200
2201 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002202 if (!list) {
2203 state_fini(&state);
2204 return NULL;
2205 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002206
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002207 n = 0;
2208 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002209
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002210 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002211
2212 state_reset(&state);
2213
2214 state.ptr = state.start;
2215
2216 if (state.charsize == 1) {
2217 status = sre_search(&state, PatternObject_GetCode(self));
2218 } else {
2219#if defined(HAVE_UNICODE)
2220 status = sre_usearch(&state, PatternObject_GetCode(self));
2221#endif
2222 }
2223
Thomas Wouters89f507f2006-12-13 04:49:30 +00002224 if (PyErr_Occurred())
2225 goto error;
2226
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002227 if (status <= 0) {
2228 if (status == 0)
2229 break;
2230 pattern_error(status);
2231 goto error;
2232 }
Tim Peters3d563502006-01-21 02:47:53 +00002233
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002234 if (state.start == state.ptr) {
2235 if (last == state.end)
2236 break;
2237 /* skip one character */
2238 state.start = (void*) ((char*) state.ptr + state.charsize);
2239 continue;
2240 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002241
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002242 /* get segment before this match */
2243 item = PySequence_GetSlice(
2244 string, STATE_OFFSET(&state, last),
2245 STATE_OFFSET(&state, state.start)
2246 );
2247 if (!item)
2248 goto error;
2249 status = PyList_Append(list, item);
2250 Py_DECREF(item);
2251 if (status < 0)
2252 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002253
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002254 /* add groups (if any) */
2255 for (i = 0; i < self->groups; i++) {
2256 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002257 if (!item)
2258 goto error;
2259 status = PyList_Append(list, item);
2260 Py_DECREF(item);
2261 if (status < 0)
2262 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002263 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002264
2265 n = n + 1;
2266
2267 last = state.start = state.ptr;
2268
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002269 }
2270
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002271 /* get segment following last match (even if empty) */
2272 item = PySequence_GetSlice(
2273 string, STATE_OFFSET(&state, last), state.endpos
2274 );
2275 if (!item)
2276 goto error;
2277 status = PyList_Append(list, item);
2278 Py_DECREF(item);
2279 if (status < 0)
2280 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002281
2282 state_fini(&state);
2283 return list;
2284
2285error:
2286 Py_DECREF(list);
2287 state_fini(&state);
2288 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002289
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002290}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002291
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002292static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002293pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002294 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002295{
2296 SRE_STATE state;
2297 PyObject* list;
2298 PyObject* item;
2299 PyObject* filter;
2300 PyObject* args;
2301 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002302 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002303 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002304 Py_ssize_t n;
2305 Py_ssize_t i, b, e;
2306 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002307 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002308 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002309
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002310 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002311 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002312 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002313 Py_INCREF(filter);
2314 filter_is_callable = 1;
2315 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002316 /* if not callable, check if it's a literal string */
2317 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002318 view.buf = NULL;
2319 ptr = getstring(ptemplate, &n, &bint, &view);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002320 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002321 if (ptr) {
2322 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002323 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002324 } else {
2325#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002326 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002327#endif
2328 }
2329 } else {
2330 PyErr_Clear();
2331 literal = 0;
2332 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002333 if (view.buf)
2334 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002335 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002336 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002337 Py_INCREF(filter);
2338 filter_is_callable = 0;
2339 } else {
2340 /* not a literal; hand it over to the template compiler */
2341 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002342 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002343 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002344 );
2345 if (!filter)
2346 return NULL;
2347 filter_is_callable = PyCallable_Check(filter);
2348 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002349 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002350
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002351 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002352 if (!string) {
2353 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002354 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002355 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002356
2357 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002358 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002359 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002360 state_fini(&state);
2361 return NULL;
2362 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002363
2364 n = i = 0;
2365
2366 while (!count || n < count) {
2367
2368 state_reset(&state);
2369
2370 state.ptr = state.start;
2371
2372 if (state.charsize == 1) {
2373 status = sre_search(&state, PatternObject_GetCode(self));
2374 } else {
2375#if defined(HAVE_UNICODE)
2376 status = sre_usearch(&state, PatternObject_GetCode(self));
2377#endif
2378 }
2379
Thomas Wouters89f507f2006-12-13 04:49:30 +00002380 if (PyErr_Occurred())
2381 goto error;
2382
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002383 if (status <= 0) {
2384 if (status == 0)
2385 break;
2386 pattern_error(status);
2387 goto error;
2388 }
Tim Peters3d563502006-01-21 02:47:53 +00002389
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002390 b = STATE_OFFSET(&state, state.start);
2391 e = STATE_OFFSET(&state, state.ptr);
2392
2393 if (i < b) {
2394 /* get segment before this match */
2395 item = PySequence_GetSlice(string, i, b);
2396 if (!item)
2397 goto error;
2398 status = PyList_Append(list, item);
2399 Py_DECREF(item);
2400 if (status < 0)
2401 goto error;
2402
2403 } else if (i == b && i == e && n > 0)
2404 /* ignore empty match on latest position */
2405 goto next;
2406
2407 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002408 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002409 match = pattern_new_match(self, &state, 1);
2410 if (!match)
2411 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002412 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002413 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002414 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002415 goto error;
2416 }
2417 item = PyObject_CallObject(filter, args);
2418 Py_DECREF(args);
2419 Py_DECREF(match);
2420 if (!item)
2421 goto error;
2422 } else {
2423 /* filter is literal string */
2424 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002425 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002426 }
2427
2428 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002429 if (item != Py_None) {
2430 status = PyList_Append(list, item);
2431 Py_DECREF(item);
2432 if (status < 0)
2433 goto error;
2434 }
Tim Peters3d563502006-01-21 02:47:53 +00002435
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002436 i = e;
2437 n = n + 1;
2438
2439next:
2440 /* move on */
2441 if (state.ptr == state.start)
2442 state.start = (void*) ((char*) state.ptr + state.charsize);
2443 else
2444 state.start = state.ptr;
2445
2446 }
2447
2448 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002449 if (i < state.endpos) {
2450 item = PySequence_GetSlice(string, i, state.endpos);
2451 if (!item)
2452 goto error;
2453 status = PyList_Append(list, item);
2454 Py_DECREF(item);
2455 if (status < 0)
2456 goto error;
2457 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002458
2459 state_fini(&state);
2460
Guido van Rossum4e173842001-12-07 04:25:10 +00002461 Py_DECREF(filter);
2462
Fredrik Lundhdac58492001-10-21 21:48:30 +00002463 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002464 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002465
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466 if (!item)
2467 return NULL;
2468
2469 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002470 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002471
2472 return item;
2473
2474error:
2475 Py_DECREF(list);
2476 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002477 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002478 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002479
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002480}
2481
2482static PyObject*
2483pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2484{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002485 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002486 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002487 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002488 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002489 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002491 return NULL;
2492
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002493 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002494}
2495
2496static PyObject*
2497pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2498{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002499 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002500 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002501 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002502 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002503 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002504 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002505 return NULL;
2506
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002507 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002508}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002509
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002510static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002511pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002512{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002513#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002514 PatternObject* copy;
2515 int offset;
2516
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002517 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2518 if (!copy)
2519 return NULL;
2520
2521 offset = offsetof(PatternObject, groups);
2522
2523 Py_XINCREF(self->groupindex);
2524 Py_XINCREF(self->indexgroup);
2525 Py_XINCREF(self->pattern);
2526
2527 memcpy((char*) copy + offset, (char*) self + offset,
2528 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002529 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002530
2531 return (PyObject*) copy;
2532#else
2533 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2534 return NULL;
2535#endif
2536}
2537
2538static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002539pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002540{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002541#ifdef USE_BUILTIN_COPY
2542 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002543
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002544 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002545 if (!copy)
2546 return NULL;
2547
2548 if (!deepcopy(&copy->groupindex, memo) ||
2549 !deepcopy(&copy->indexgroup, memo) ||
2550 !deepcopy(&copy->pattern, memo)) {
2551 Py_DECREF(copy);
2552 return NULL;
2553 }
2554
2555#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002556 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2557 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002558#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002559}
2560
Raymond Hettinger94478742004-09-24 04:31:19 +00002561PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002562"match(string[, pos[, endpos]]) -> match object or None.\n\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002563 Matches zero or more characters at the beginning of the string");
2564
2565PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002566"search(string[, pos[, endpos]]) -> match object or None.\n\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002567 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02002568 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002569
2570PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002571"split(string[, maxsplit = 0]) -> list.\n\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002572 Split string by the occurrences of pattern.");
2573
2574PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002575"findall(string[, pos[, endpos]]) -> list.\n\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002576 Return a list of all non-overlapping matches of pattern in string.");
2577
2578PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002579"finditer(string[, pos[, endpos]]) -> iterator.\n\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002580 Return an iterator over all non-overlapping matches for the \n\
2581 RE pattern in string. For each match, the iterator returns a\n\
2582 match object.");
2583
2584PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002585"sub(repl, string[, count = 0]) -> newstring.\n\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002586 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002587 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002588
2589PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002590"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002591 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2592 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002593 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002594
2595PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2596
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002597static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002598 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002599 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002600 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002601 pattern_search_doc},
2602 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2603 pattern_sub_doc},
2604 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2605 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002606 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002607 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002608 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002609 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002610#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002611 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2612 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002613#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002614 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002615 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2616 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002617 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002618};
2619
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002620#define PAT_OFF(x) offsetof(PatternObject, x)
2621static PyMemberDef pattern_members[] = {
2622 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2623 {"flags", T_INT, PAT_OFF(flags), READONLY},
2624 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2625 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2626 {NULL} /* Sentinel */
2627};
Guido van Rossumb700df92000-03-31 14:59:30 +00002628
Neal Norwitz57c179c2006-03-22 07:18:02 +00002629static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002630 PyVarObject_HEAD_INIT(NULL, 0)
2631 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002632 sizeof(PatternObject), sizeof(SRE_CODE),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002633 (destructor)pattern_dealloc, /* tp_dealloc */
2634 0, /* tp_print */
2635 0, /* tp_getattr */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002636 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00002637 0, /* tp_reserved */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002638 0, /* tp_repr */
2639 0, /* tp_as_number */
2640 0, /* tp_as_sequence */
2641 0, /* tp_as_mapping */
2642 0, /* tp_hash */
2643 0, /* tp_call */
2644 0, /* tp_str */
2645 0, /* tp_getattro */
2646 0, /* tp_setattro */
2647 0, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00002648 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002649 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002650 0, /* tp_traverse */
2651 0, /* tp_clear */
2652 0, /* tp_richcompare */
2653 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002654 0, /* tp_iter */
2655 0, /* tp_iternext */
2656 pattern_methods, /* tp_methods */
2657 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002658};
2659
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002660static int _validate(PatternObject *self); /* Forward */
2661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002662static PyObject *
2663_compile(PyObject* self_, PyObject* args)
2664{
2665 /* "compile" pattern descriptor to pattern object */
2666
2667 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002668 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002669
2670 PyObject* pattern;
2671 int flags = 0;
2672 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002673 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002674 PyObject* groupindex = NULL;
2675 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002676
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002677 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002678 &PyList_Type, &code, &groups,
2679 &groupindex, &indexgroup))
2680 return NULL;
2681
2682 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002683 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002684 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2685 if (!self)
2686 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002687 self->weakreflist = NULL;
2688 self->pattern = NULL;
2689 self->groupindex = NULL;
2690 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002691 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002692
2693 self->codesize = n;
2694
2695 for (i = 0; i < n; i++) {
2696 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002697 unsigned long value = PyLong_AsUnsignedLong(o);
Antoine Pitrou39bdad82012-11-20 22:30:42 +01002698 if (value == (unsigned long)-1 && PyErr_Occurred()) {
2699 if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2700 PyErr_SetString(PyExc_OverflowError,
2701 "regular expression code size limit exceeded");
2702 }
2703 break;
2704 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002705 self->code[i] = (SRE_CODE) value;
2706 if ((unsigned long) self->code[i] != value) {
2707 PyErr_SetString(PyExc_OverflowError,
2708 "regular expression code size limit exceeded");
2709 break;
2710 }
2711 }
2712
2713 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002714 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002715 return NULL;
2716 }
2717
Benjamin Petersone48944b2012-03-07 14:50:25 -06002718 if (pattern == Py_None)
2719 self->charsize = -1;
2720 else {
2721 Py_ssize_t p_length;
2722 if (!getstring(pattern, &p_length, &self->charsize, &self->view)) {
2723 Py_DECREF(self);
2724 return NULL;
2725 }
2726 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002727
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002728 Py_INCREF(pattern);
2729 self->pattern = pattern;
2730
2731 self->flags = flags;
2732
2733 self->groups = groups;
2734
2735 Py_XINCREF(groupindex);
2736 self->groupindex = groupindex;
2737
2738 Py_XINCREF(indexgroup);
2739 self->indexgroup = indexgroup;
2740
2741 self->weakreflist = NULL;
2742
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002743 if (!_validate(self)) {
2744 Py_DECREF(self);
2745 return NULL;
2746 }
2747
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002748 return (PyObject*) self;
2749}
2750
Guido van Rossumb700df92000-03-31 14:59:30 +00002751/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002752/* Code validation */
2753
2754/* To learn more about this code, have a look at the _compile() function in
2755 Lib/sre_compile.py. The validation functions below checks the code array
2756 for conformance with the code patterns generated there.
2757
2758 The nice thing about the generated code is that it is position-independent:
2759 all jumps are relative jumps forward. Also, jumps don't cross each other:
2760 the target of a later jump is always earlier than the target of an earlier
2761 jump. IOW, this is okay:
2762
2763 J---------J-------T--------T
2764 \ \_____/ /
2765 \______________________/
2766
2767 but this is not:
2768
2769 J---------J-------T--------T
2770 \_________\_____/ /
2771 \____________/
2772
2773 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2774 bytes wide (the latter if Python is compiled for "wide" unicode support).
2775*/
2776
2777/* Defining this one enables tracing of the validator */
2778#undef VVERBOSE
2779
2780/* Trace macro for the validator */
2781#if defined(VVERBOSE)
2782#define VTRACE(v) printf v
2783#else
Senthil Kumaran202a3c42011-10-20 02:15:36 +08002784#define VTRACE(v) do {} while(0) /* do nothing */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002785#endif
2786
2787/* Report failure */
2788#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2789
2790/* Extract opcode, argument, or skip count from code array */
2791#define GET_OP \
2792 do { \
2793 VTRACE(("%p: ", code)); \
2794 if (code >= end) FAIL; \
2795 op = *code++; \
2796 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2797 } while (0)
2798#define GET_ARG \
2799 do { \
2800 VTRACE(("%p= ", code)); \
2801 if (code >= end) FAIL; \
2802 arg = *code++; \
2803 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2804 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002805#define GET_SKIP_ADJ(adj) \
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002806 do { \
2807 VTRACE(("%p= ", code)); \
2808 if (code >= end) FAIL; \
2809 skip = *code; \
2810 VTRACE(("%lu (skip to %p)\n", \
2811 (unsigned long)skip, code+skip)); \
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002812 if (code+skip-adj < code || code+skip-adj > end)\
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002813 FAIL; \
2814 code++; \
2815 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002816#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002817
2818static int
2819_validate_charset(SRE_CODE *code, SRE_CODE *end)
2820{
2821 /* Some variables are manipulated by the macros above */
2822 SRE_CODE op;
2823 SRE_CODE arg;
2824 SRE_CODE offset;
2825 int i;
2826
2827 while (code < end) {
2828 GET_OP;
2829 switch (op) {
2830
2831 case SRE_OP_NEGATE:
2832 break;
2833
2834 case SRE_OP_LITERAL:
2835 GET_ARG;
2836 break;
2837
2838 case SRE_OP_RANGE:
2839 GET_ARG;
2840 GET_ARG;
2841 break;
2842
2843 case SRE_OP_CHARSET:
2844 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2845 if (code+offset < code || code+offset > end)
2846 FAIL;
2847 code += offset;
2848 break;
2849
2850 case SRE_OP_BIGCHARSET:
2851 GET_ARG; /* Number of blocks */
2852 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2853 if (code+offset < code || code+offset > end)
2854 FAIL;
2855 /* Make sure that each byte points to a valid block */
2856 for (i = 0; i < 256; i++) {
2857 if (((unsigned char *)code)[i] >= arg)
2858 FAIL;
2859 }
2860 code += offset;
2861 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2862 if (code+offset < code || code+offset > end)
2863 FAIL;
2864 code += offset;
2865 break;
2866
2867 case SRE_OP_CATEGORY:
2868 GET_ARG;
2869 switch (arg) {
2870 case SRE_CATEGORY_DIGIT:
2871 case SRE_CATEGORY_NOT_DIGIT:
2872 case SRE_CATEGORY_SPACE:
2873 case SRE_CATEGORY_NOT_SPACE:
2874 case SRE_CATEGORY_WORD:
2875 case SRE_CATEGORY_NOT_WORD:
2876 case SRE_CATEGORY_LINEBREAK:
2877 case SRE_CATEGORY_NOT_LINEBREAK:
2878 case SRE_CATEGORY_LOC_WORD:
2879 case SRE_CATEGORY_LOC_NOT_WORD:
2880 case SRE_CATEGORY_UNI_DIGIT:
2881 case SRE_CATEGORY_UNI_NOT_DIGIT:
2882 case SRE_CATEGORY_UNI_SPACE:
2883 case SRE_CATEGORY_UNI_NOT_SPACE:
2884 case SRE_CATEGORY_UNI_WORD:
2885 case SRE_CATEGORY_UNI_NOT_WORD:
2886 case SRE_CATEGORY_UNI_LINEBREAK:
2887 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2888 break;
2889 default:
2890 FAIL;
2891 }
2892 break;
2893
2894 default:
2895 FAIL;
2896
2897 }
2898 }
2899
2900 return 1;
2901}
2902
2903static int
2904_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2905{
2906 /* Some variables are manipulated by the macros above */
2907 SRE_CODE op;
2908 SRE_CODE arg;
2909 SRE_CODE skip;
2910
2911 VTRACE(("code=%p, end=%p\n", code, end));
2912
2913 if (code > end)
2914 FAIL;
2915
2916 while (code < end) {
2917 GET_OP;
2918 switch (op) {
2919
2920 case SRE_OP_MARK:
2921 /* We don't check whether marks are properly nested; the
2922 sre_match() code is robust even if they don't, and the worst
2923 you can get is nonsensical match results. */
2924 GET_ARG;
2925 if (arg > 2*groups+1) {
2926 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2927 FAIL;
2928 }
2929 break;
2930
2931 case SRE_OP_LITERAL:
2932 case SRE_OP_NOT_LITERAL:
2933 case SRE_OP_LITERAL_IGNORE:
2934 case SRE_OP_NOT_LITERAL_IGNORE:
2935 GET_ARG;
2936 /* The arg is just a character, nothing to check */
2937 break;
2938
2939 case SRE_OP_SUCCESS:
2940 case SRE_OP_FAILURE:
2941 /* Nothing to check; these normally end the matching process */
2942 break;
2943
2944 case SRE_OP_AT:
2945 GET_ARG;
2946 switch (arg) {
2947 case SRE_AT_BEGINNING:
2948 case SRE_AT_BEGINNING_STRING:
2949 case SRE_AT_BEGINNING_LINE:
2950 case SRE_AT_END:
2951 case SRE_AT_END_LINE:
2952 case SRE_AT_END_STRING:
2953 case SRE_AT_BOUNDARY:
2954 case SRE_AT_NON_BOUNDARY:
2955 case SRE_AT_LOC_BOUNDARY:
2956 case SRE_AT_LOC_NON_BOUNDARY:
2957 case SRE_AT_UNI_BOUNDARY:
2958 case SRE_AT_UNI_NON_BOUNDARY:
2959 break;
2960 default:
2961 FAIL;
2962 }
2963 break;
2964
2965 case SRE_OP_ANY:
2966 case SRE_OP_ANY_ALL:
2967 /* These have no operands */
2968 break;
2969
2970 case SRE_OP_IN:
2971 case SRE_OP_IN_IGNORE:
2972 GET_SKIP;
2973 /* Stop 1 before the end; we check the FAILURE below */
2974 if (!_validate_charset(code, code+skip-2))
2975 FAIL;
2976 if (code[skip-2] != SRE_OP_FAILURE)
2977 FAIL;
2978 code += skip-1;
2979 break;
2980
2981 case SRE_OP_INFO:
2982 {
2983 /* A minimal info field is
2984 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2985 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2986 more follows. */
2987 SRE_CODE flags, min, max, i;
2988 SRE_CODE *newcode;
2989 GET_SKIP;
2990 newcode = code+skip-1;
2991 GET_ARG; flags = arg;
2992 GET_ARG; min = arg;
2993 GET_ARG; max = arg;
2994 /* Check that only valid flags are present */
2995 if ((flags & ~(SRE_INFO_PREFIX |
2996 SRE_INFO_LITERAL |
2997 SRE_INFO_CHARSET)) != 0)
2998 FAIL;
2999 /* PREFIX and CHARSET are mutually exclusive */
3000 if ((flags & SRE_INFO_PREFIX) &&
3001 (flags & SRE_INFO_CHARSET))
3002 FAIL;
3003 /* LITERAL implies PREFIX */
3004 if ((flags & SRE_INFO_LITERAL) &&
3005 !(flags & SRE_INFO_PREFIX))
3006 FAIL;
3007 /* Validate the prefix */
3008 if (flags & SRE_INFO_PREFIX) {
3009 SRE_CODE prefix_len, prefix_skip;
3010 GET_ARG; prefix_len = arg;
3011 GET_ARG; prefix_skip = arg;
3012 /* Here comes the prefix string */
3013 if (code+prefix_len < code || code+prefix_len > newcode)
3014 FAIL;
3015 code += prefix_len;
3016 /* And here comes the overlap table */
3017 if (code+prefix_len < code || code+prefix_len > newcode)
3018 FAIL;
3019 /* Each overlap value should be < prefix_len */
3020 for (i = 0; i < prefix_len; i++) {
3021 if (code[i] >= prefix_len)
3022 FAIL;
3023 }
3024 code += prefix_len;
3025 }
3026 /* Validate the charset */
3027 if (flags & SRE_INFO_CHARSET) {
3028 if (!_validate_charset(code, newcode-1))
3029 FAIL;
3030 if (newcode[-1] != SRE_OP_FAILURE)
3031 FAIL;
3032 code = newcode;
3033 }
3034 else if (code != newcode) {
3035 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3036 FAIL;
3037 }
3038 }
3039 break;
3040
3041 case SRE_OP_BRANCH:
3042 {
3043 SRE_CODE *target = NULL;
3044 for (;;) {
3045 GET_SKIP;
3046 if (skip == 0)
3047 break;
3048 /* Stop 2 before the end; we check the JUMP below */
3049 if (!_validate_inner(code, code+skip-3, groups))
3050 FAIL;
3051 code += skip-3;
3052 /* Check that it ends with a JUMP, and that each JUMP
3053 has the same target */
3054 GET_OP;
3055 if (op != SRE_OP_JUMP)
3056 FAIL;
3057 GET_SKIP;
3058 if (target == NULL)
3059 target = code+skip-1;
3060 else if (code+skip-1 != target)
3061 FAIL;
3062 }
3063 }
3064 break;
3065
3066 case SRE_OP_REPEAT_ONE:
3067 case SRE_OP_MIN_REPEAT_ONE:
3068 {
3069 SRE_CODE min, max;
3070 GET_SKIP;
3071 GET_ARG; min = arg;
3072 GET_ARG; max = arg;
3073 if (min > max)
3074 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003075 if (max > 65535)
3076 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003077 if (!_validate_inner(code, code+skip-4, groups))
3078 FAIL;
3079 code += skip-4;
3080 GET_OP;
3081 if (op != SRE_OP_SUCCESS)
3082 FAIL;
3083 }
3084 break;
3085
3086 case SRE_OP_REPEAT:
3087 {
3088 SRE_CODE min, max;
3089 GET_SKIP;
3090 GET_ARG; min = arg;
3091 GET_ARG; max = arg;
3092 if (min > max)
3093 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003094 if (max > 65535)
3095 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003096 if (!_validate_inner(code, code+skip-3, groups))
3097 FAIL;
3098 code += skip-3;
3099 GET_OP;
3100 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3101 FAIL;
3102 }
3103 break;
3104
3105 case SRE_OP_GROUPREF:
3106 case SRE_OP_GROUPREF_IGNORE:
3107 GET_ARG;
3108 if (arg >= groups)
3109 FAIL;
3110 break;
3111
3112 case SRE_OP_GROUPREF_EXISTS:
3113 /* The regex syntax for this is: '(?(group)then|else)', where
3114 'group' is either an integer group number or a group name,
3115 'then' and 'else' are sub-regexes, and 'else' is optional. */
3116 GET_ARG;
3117 if (arg >= groups)
3118 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003119 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003120 code--; /* The skip is relative to the first arg! */
3121 /* There are two possibilities here: if there is both a 'then'
3122 part and an 'else' part, the generated code looks like:
3123
3124 GROUPREF_EXISTS
3125 <group>
3126 <skipyes>
3127 ...then part...
3128 JUMP
3129 <skipno>
3130 (<skipyes> jumps here)
3131 ...else part...
3132 (<skipno> jumps here)
3133
3134 If there is only a 'then' part, it looks like:
3135
3136 GROUPREF_EXISTS
3137 <group>
3138 <skip>
3139 ...then part...
3140 (<skip> jumps here)
3141
3142 There is no direct way to decide which it is, and we don't want
3143 to allow arbitrary jumps anywhere in the code; so we just look
3144 for a JUMP opcode preceding our skip target.
3145 */
3146 if (skip >= 3 && code+skip-3 >= code &&
3147 code[skip-3] == SRE_OP_JUMP)
3148 {
3149 VTRACE(("both then and else parts present\n"));
3150 if (!_validate_inner(code+1, code+skip-3, groups))
3151 FAIL;
3152 code += skip-2; /* Position after JUMP, at <skipno> */
3153 GET_SKIP;
3154 if (!_validate_inner(code, code+skip-1, groups))
3155 FAIL;
3156 code += skip-1;
3157 }
3158 else {
3159 VTRACE(("only a then part present\n"));
3160 if (!_validate_inner(code+1, code+skip-1, groups))
3161 FAIL;
3162 code += skip-1;
3163 }
3164 break;
3165
3166 case SRE_OP_ASSERT:
3167 case SRE_OP_ASSERT_NOT:
3168 GET_SKIP;
3169 GET_ARG; /* 0 for lookahead, width for lookbehind */
3170 code--; /* Back up over arg to simplify math below */
3171 if (arg & 0x80000000)
3172 FAIL; /* Width too large */
3173 /* Stop 1 before the end; we check the SUCCESS below */
3174 if (!_validate_inner(code+1, code+skip-2, groups))
3175 FAIL;
3176 code += skip-2;
3177 GET_OP;
3178 if (op != SRE_OP_SUCCESS)
3179 FAIL;
3180 break;
3181
3182 default:
3183 FAIL;
3184
3185 }
3186 }
3187
3188 VTRACE(("okay\n"));
3189 return 1;
3190}
3191
3192static int
3193_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3194{
3195 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3196 FAIL;
3197 if (groups == 0) /* fix for simplejson */
3198 groups = 100; /* 100 groups should always be safe */
3199 return _validate_inner(code, end-1, groups);
3200}
3201
3202static int
3203_validate(PatternObject *self)
3204{
3205 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3206 {
3207 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3208 return 0;
3209 }
3210 else
3211 VTRACE(("Success!\n"));
3212 return 1;
3213}
3214
3215/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003216/* match methods */
3217
3218static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003219match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003220{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003221 Py_XDECREF(self->regs);
3222 Py_XDECREF(self->string);
3223 Py_DECREF(self->pattern);
3224 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003225}
3226
3227static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003228match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003229{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003230 if (index < 0 || index >= self->groups) {
3231 /* raise IndexError if we were given a bad group number */
3232 PyErr_SetString(
3233 PyExc_IndexError,
3234 "no such group"
3235 );
3236 return NULL;
3237 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003238
Fredrik Lundh6f013982000-07-03 18:44:21 +00003239 index *= 2;
3240
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003241 if (self->string == Py_None || self->mark[index] < 0) {
3242 /* return default value if the string or group is undefined */
3243 Py_INCREF(def);
3244 return def;
3245 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003246
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003247 return PySequence_GetSlice(
3248 self->string, self->mark[index], self->mark[index+1]
3249 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003250}
3251
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003252static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003253match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003254{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003255 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003256
Guido van Rossumddefaf32007-01-14 03:31:43 +00003257 if (index == NULL)
3258 /* Default value */
3259 return 0;
3260
Christian Heimes217cfd12007-12-02 14:31:20 +00003261 if (PyLong_Check(index))
3262 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003263
Fredrik Lundh6f013982000-07-03 18:44:21 +00003264 i = -1;
3265
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003266 if (self->pattern->groupindex) {
3267 index = PyObject_GetItem(self->pattern->groupindex, index);
3268 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003269 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003270 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003271 Py_DECREF(index);
3272 } else
3273 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003274 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003275
3276 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003277}
3278
3279static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003280match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003281{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003282 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003283}
3284
3285static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003286match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003287{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003288 /* delegate to Python code */
3289 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003290 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003291 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003292 );
3293}
3294
3295static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003296match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003297{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003298 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003299 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003300
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003301 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003302
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003303 switch (size) {
3304 case 0:
3305 result = match_getslice(self, Py_False, Py_None);
3306 break;
3307 case 1:
3308 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3309 break;
3310 default:
3311 /* fetch multiple items */
3312 result = PyTuple_New(size);
3313 if (!result)
3314 return NULL;
3315 for (i = 0; i < size; i++) {
3316 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003317 self, PyTuple_GET_ITEM(args, i), Py_None
3318 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003319 if (!item) {
3320 Py_DECREF(result);
3321 return NULL;
3322 }
3323 PyTuple_SET_ITEM(result, i, item);
3324 }
3325 break;
3326 }
3327 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003328}
3329
3330static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003331match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003332{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003333 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003334 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003336 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003337 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003338 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003339 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003340
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003341 result = PyTuple_New(self->groups-1);
3342 if (!result)
3343 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003345 for (index = 1; index < self->groups; index++) {
3346 PyObject* item;
3347 item = match_getslice_by_index(self, index, def);
3348 if (!item) {
3349 Py_DECREF(result);
3350 return NULL;
3351 }
3352 PyTuple_SET_ITEM(result, index-1, item);
3353 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003354
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003355 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003356}
3357
3358static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003359match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003360{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003361 PyObject* result;
3362 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003363 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003364
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003365 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003366 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003367 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003368 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003370 result = PyDict_New();
3371 if (!result || !self->pattern->groupindex)
3372 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003374 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003375 if (!keys)
3376 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003377
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003378 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003379 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003380 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003381 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003382 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003383 if (!key)
3384 goto failed;
3385 value = match_getslice(self, key, def);
3386 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003387 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003388 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003389 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003390 status = PyDict_SetItem(result, key, value);
3391 Py_DECREF(value);
3392 if (status < 0)
3393 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003394 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003395
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003396 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003397
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003398 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003399
3400failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003401 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003402 Py_DECREF(result);
3403 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003404}
3405
3406static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003407match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003408{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003409 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003410
Guido van Rossumddefaf32007-01-14 03:31:43 +00003411 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003412 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003413 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003414
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003415 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003416
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003417 if (index < 0 || index >= self->groups) {
3418 PyErr_SetString(
3419 PyExc_IndexError,
3420 "no such group"
3421 );
3422 return NULL;
3423 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003424
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003425 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003426 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003427}
3428
3429static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003430match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003431{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003432 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003433
Guido van Rossumddefaf32007-01-14 03:31:43 +00003434 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003435 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003436 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003437
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003438 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003439
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003440 if (index < 0 || index >= self->groups) {
3441 PyErr_SetString(
3442 PyExc_IndexError,
3443 "no such group"
3444 );
3445 return NULL;
3446 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003447
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003448 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003449 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003450}
3451
3452LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003453_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003454{
3455 PyObject* pair;
3456 PyObject* item;
3457
3458 pair = PyTuple_New(2);
3459 if (!pair)
3460 return NULL;
3461
Christian Heimes217cfd12007-12-02 14:31:20 +00003462 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003463 if (!item)
3464 goto error;
3465 PyTuple_SET_ITEM(pair, 0, item);
3466
Christian Heimes217cfd12007-12-02 14:31:20 +00003467 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003468 if (!item)
3469 goto error;
3470 PyTuple_SET_ITEM(pair, 1, item);
3471
3472 return pair;
3473
3474 error:
3475 Py_DECREF(pair);
3476 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003477}
3478
3479static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003480match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003481{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003482 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003483
Guido van Rossumddefaf32007-01-14 03:31:43 +00003484 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003485 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003486 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003487
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003488 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003489
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003490 if (index < 0 || index >= self->groups) {
3491 PyErr_SetString(
3492 PyExc_IndexError,
3493 "no such group"
3494 );
3495 return NULL;
3496 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003497
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003498 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003499 return _pair(self->mark[index*2], self->mark[index*2+1]);
3500}
3501
3502static PyObject*
3503match_regs(MatchObject* self)
3504{
3505 PyObject* regs;
3506 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003507 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003508
3509 regs = PyTuple_New(self->groups);
3510 if (!regs)
3511 return NULL;
3512
3513 for (index = 0; index < self->groups; index++) {
3514 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3515 if (!item) {
3516 Py_DECREF(regs);
3517 return NULL;
3518 }
3519 PyTuple_SET_ITEM(regs, index, item);
3520 }
3521
3522 Py_INCREF(regs);
3523 self->regs = regs;
3524
3525 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003526}
3527
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003528static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003529match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003530{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003531#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003532 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003533 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003534
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003535 slots = 2 * (self->pattern->groups+1);
3536
3537 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3538 if (!copy)
3539 return NULL;
3540
3541 /* this value a constant, but any compiler should be able to
3542 figure that out all by itself */
3543 offset = offsetof(MatchObject, string);
3544
3545 Py_XINCREF(self->pattern);
3546 Py_XINCREF(self->string);
3547 Py_XINCREF(self->regs);
3548
3549 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003550 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003551
3552 return (PyObject*) copy;
3553#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003554 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003555 return NULL;
3556#endif
3557}
3558
3559static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003560match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003561{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003562#ifdef USE_BUILTIN_COPY
3563 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003564
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003565 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003566 if (!copy)
3567 return NULL;
3568
3569 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3570 !deepcopy(&copy->string, memo) ||
3571 !deepcopy(&copy->regs, memo)) {
3572 Py_DECREF(copy);
3573 return NULL;
3574 }
3575
3576#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003577 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3578 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003579#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003580}
3581
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003582PyDoc_STRVAR(match_doc,
3583"The result of re.match() and re.search().\n\
3584Match objects always have a boolean value of True.");
3585
3586PyDoc_STRVAR(match_group_doc,
3587"group([group1, ...]) -> str or tuple.\n\n\
3588 Return subgroup(s) of the match by indices or names.\n\
3589 For 0 returns the entire match.");
3590
3591PyDoc_STRVAR(match_start_doc,
3592"start([group=0]) -> int.\n\n\
3593 Return index of the start of the substring matched by group.");
3594
3595PyDoc_STRVAR(match_end_doc,
3596"end([group=0]) -> int.\n\n\
3597 Return index of the end of the substring matched by group.");
3598
3599PyDoc_STRVAR(match_span_doc,
3600"span([group]) -> tuple.\n\n\
3601 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3602
3603PyDoc_STRVAR(match_groups_doc,
3604"groups([default=None]) -> tuple.\n\n\
3605 Return a tuple containing all the subgroups of the match, from 1.\n\
3606 The default argument is used for groups\n\
3607 that did not participate in the match");
3608
3609PyDoc_STRVAR(match_groupdict_doc,
3610"groupdict([default=None]) -> dict.\n\n\
3611 Return a dictionary containing all the named subgroups of the match,\n\
3612 keyed by the subgroup name. The default argument is used for groups\n\
3613 that did not participate in the match");
3614
3615PyDoc_STRVAR(match_expand_doc,
3616"expand(template) -> str.\n\n\
3617 Return the string obtained by doing backslash substitution\n\
3618 on the string template, as done by the sub() method.");
3619
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003620static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003621 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3622 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3623 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3624 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3625 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3626 match_groups_doc},
3627 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3628 match_groupdict_doc},
3629 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003630 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3631 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003632 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003633};
3634
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003635static PyObject *
3636match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003637{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003638 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003639 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003640 Py_INCREF(Py_None);
3641 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003642}
3643
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003644static PyObject *
3645match_lastgroup_get(MatchObject *self)
3646{
3647 if (self->pattern->indexgroup && self->lastindex >= 0) {
3648 PyObject* result = PySequence_GetItem(
3649 self->pattern->indexgroup, self->lastindex
3650 );
3651 if (result)
3652 return result;
3653 PyErr_Clear();
3654 }
3655 Py_INCREF(Py_None);
3656 return Py_None;
3657}
3658
3659static PyObject *
3660match_regs_get(MatchObject *self)
3661{
3662 if (self->regs) {
3663 Py_INCREF(self->regs);
3664 return self->regs;
3665 } else
3666 return match_regs(self);
3667}
3668
3669static PyGetSetDef match_getset[] = {
3670 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3671 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3672 {"regs", (getter)match_regs_get, (setter)NULL},
3673 {NULL}
3674};
3675
3676#define MATCH_OFF(x) offsetof(MatchObject, x)
3677static PyMemberDef match_members[] = {
3678 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3679 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3680 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3681 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3682 {NULL}
3683};
3684
Guido van Rossumb700df92000-03-31 14:59:30 +00003685/* FIXME: implement setattr("string", None) as a special case (to
3686 detach the associated string, if any */
3687
Neal Norwitz57c179c2006-03-22 07:18:02 +00003688static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003689 PyVarObject_HEAD_INIT(NULL,0)
3690 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003691 sizeof(MatchObject), sizeof(Py_ssize_t),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003692 (destructor)match_dealloc, /* tp_dealloc */
3693 0, /* tp_print */
3694 0, /* tp_getattr */
3695 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003696 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003697 0, /* tp_repr */
3698 0, /* tp_as_number */
3699 0, /* tp_as_sequence */
3700 0, /* tp_as_mapping */
3701 0, /* tp_hash */
3702 0, /* tp_call */
3703 0, /* tp_str */
3704 0, /* tp_getattro */
3705 0, /* tp_setattro */
3706 0, /* tp_as_buffer */
3707 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003708 match_doc, /* tp_doc */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003709 0, /* tp_traverse */
3710 0, /* tp_clear */
3711 0, /* tp_richcompare */
3712 0, /* tp_weaklistoffset */
3713 0, /* tp_iter */
3714 0, /* tp_iternext */
3715 match_methods, /* tp_methods */
3716 match_members, /* tp_members */
3717 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003718};
3719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003720static PyObject*
3721pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3722{
3723 /* create match object (from state object) */
3724
3725 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003726 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003727 char* base;
3728 int n;
3729
3730 if (status > 0) {
3731
3732 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003733 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003734 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3735 2*(pattern->groups+1));
3736 if (!match)
3737 return NULL;
3738
3739 Py_INCREF(pattern);
3740 match->pattern = pattern;
3741
3742 Py_INCREF(state->string);
3743 match->string = state->string;
3744
3745 match->regs = NULL;
3746 match->groups = pattern->groups+1;
3747
3748 /* fill in group slices */
3749
3750 base = (char*) state->beginning;
3751 n = state->charsize;
3752
3753 match->mark[0] = ((char*) state->start - base) / n;
3754 match->mark[1] = ((char*) state->ptr - base) / n;
3755
3756 for (i = j = 0; i < pattern->groups; i++, j+=2)
3757 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3758 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3759 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3760 } else
3761 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3762
3763 match->pos = state->pos;
3764 match->endpos = state->endpos;
3765
3766 match->lastindex = state->lastindex;
3767
3768 return (PyObject*) match;
3769
3770 } else if (status == 0) {
3771
3772 /* no match */
3773 Py_INCREF(Py_None);
3774 return Py_None;
3775
3776 }
3777
3778 /* internal error */
3779 pattern_error(status);
3780 return NULL;
3781}
3782
3783
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003784/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003785/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003786
3787static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003788scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003789{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003790 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003791 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003792 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003793}
3794
3795static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003796scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003797{
3798 SRE_STATE* state = &self->state;
3799 PyObject* match;
3800 int status;
3801
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003802 state_reset(state);
3803
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003804 state->ptr = state->start;
3805
3806 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003807 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003808 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003809#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003810 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003811#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003812 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003813 if (PyErr_Occurred())
3814 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003815
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003816 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003817 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003818
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003819 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003820 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003821 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003822 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003823
3824 return match;
3825}
3826
3827
3828static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003829scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003830{
3831 SRE_STATE* state = &self->state;
3832 PyObject* match;
3833 int status;
3834
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003835 state_reset(state);
3836
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003837 state->ptr = state->start;
3838
3839 if (state->charsize == 1) {
3840 status = sre_search(state, PatternObject_GetCode(self->pattern));
3841 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003842#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003843 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003844#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003845 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003846 if (PyErr_Occurred())
3847 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003848
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003849 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003850 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003851
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003852 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003853 state->start = (void*) ((char*) state->ptr + state->charsize);
3854 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003855 state->start = state->ptr;
3856
3857 return match;
3858}
3859
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003860static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003861 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3862 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003863 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003864};
3865
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003866#define SCAN_OFF(x) offsetof(ScannerObject, x)
3867static PyMemberDef scanner_members[] = {
3868 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3869 {NULL} /* Sentinel */
3870};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003871
Neal Norwitz57c179c2006-03-22 07:18:02 +00003872static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003873 PyVarObject_HEAD_INIT(NULL, 0)
3874 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003875 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003876 (destructor)scanner_dealloc,/* tp_dealloc */
3877 0, /* tp_print */
3878 0, /* tp_getattr */
3879 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003880 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003881 0, /* tp_repr */
3882 0, /* tp_as_number */
3883 0, /* tp_as_sequence */
3884 0, /* tp_as_mapping */
3885 0, /* tp_hash */
3886 0, /* tp_call */
3887 0, /* tp_str */
3888 0, /* tp_getattro */
3889 0, /* tp_setattro */
3890 0, /* tp_as_buffer */
3891 Py_TPFLAGS_DEFAULT, /* tp_flags */
3892 0, /* tp_doc */
3893 0, /* tp_traverse */
3894 0, /* tp_clear */
3895 0, /* tp_richcompare */
3896 0, /* tp_weaklistoffset */
3897 0, /* tp_iter */
3898 0, /* tp_iternext */
3899 scanner_methods, /* tp_methods */
3900 scanner_members, /* tp_members */
3901 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003902};
3903
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003904static PyObject*
3905pattern_scanner(PatternObject* pattern, PyObject* args)
3906{
3907 /* create search state object */
3908
3909 ScannerObject* self;
3910
3911 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003912 Py_ssize_t start = 0;
3913 Py_ssize_t end = PY_SSIZE_T_MAX;
3914 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003915 return NULL;
3916
3917 /* create scanner object */
3918 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3919 if (!self)
3920 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003921 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003922
3923 string = state_init(&self->state, pattern, string, start, end);
3924 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003925 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003926 return NULL;
3927 }
3928
3929 Py_INCREF(pattern);
3930 self->pattern = (PyObject*) pattern;
3931
3932 return (PyObject*) self;
3933}
3934
Guido van Rossumb700df92000-03-31 14:59:30 +00003935static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003936 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003937 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003938 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003939 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003940};
3941
Martin v. Löwis1a214512008-06-11 05:26:20 +00003942static struct PyModuleDef sremodule = {
3943 PyModuleDef_HEAD_INIT,
3944 "_" SRE_MODULE,
3945 NULL,
3946 -1,
3947 _functions,
3948 NULL,
3949 NULL,
3950 NULL,
3951 NULL
3952};
3953
3954PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003955{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003956 PyObject* m;
3957 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003958 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003959
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003960 /* Patch object types */
3961 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3962 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003963 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003964
Martin v. Löwis1a214512008-06-11 05:26:20 +00003965 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003966 if (m == NULL)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003967 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003968 d = PyModule_GetDict(m);
3969
Christian Heimes217cfd12007-12-02 14:31:20 +00003970 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003971 if (x) {
3972 PyDict_SetItemString(d, "MAGIC", x);
3973 Py_DECREF(x);
3974 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003975
Christian Heimes217cfd12007-12-02 14:31:20 +00003976 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003977 if (x) {
3978 PyDict_SetItemString(d, "CODESIZE", x);
3979 Py_DECREF(x);
3980 }
3981
Neal Norwitzfe537132007-08-26 03:55:15 +00003982 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003983 if (x) {
3984 PyDict_SetItemString(d, "copyright", x);
3985 Py_DECREF(x);
3986 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003987 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003988}
3989
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003990#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003991
3992/* vim:ts=4:sw=4:et
3993*/