blob: 472b5a3797d8fa1e59a3dedce64c1e5f6401b6bd [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d52000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d52000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000070/* enables copy/deepcopy handling (work in progress) */
71#undef USE_BUILTIN_COPY
72
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000073#if PY_VERSION_HEX < 0x01060000
74#define PyObject_DEL(op) PyMem_DEL((op))
75#endif
76
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000077/* -------------------------------------------------------------------- */
78
Fredrik Lundh80946112000-06-29 18:03:25 +000079#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000080#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000081#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000082/* fastest possible local call under MSVC */
83#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000085#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000086#else
87#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000088#endif
89
90/* error codes */
91#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000092#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000093#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000094#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000095#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000096
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000097#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +000098#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000099#else
100#define TRACE(v)
101#endif
102
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000103/* -------------------------------------------------------------------- */
104/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000105
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000106/* default character predicates (run sre_chars.py to regenerate tables) */
107
108#define SRE_DIGIT_MASK 1
109#define SRE_SPACE_MASK 2
110#define SRE_LINEBREAK_MASK 4
111#define SRE_ALNUM_MASK 8
112#define SRE_WORD_MASK 16
113
Fredrik Lundh21009b92001-09-18 18:47:09 +0000114/* FIXME: this assumes ASCII. create tables in init_sre() instead */
115
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000116static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1180, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
11925, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12024, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1210, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12224, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
123
Fredrik Lundhb389df32000-06-29 12:48:37 +0000124static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000012510, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
12627, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
12744, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
12861, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
129108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
130122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
131106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
132120, 121, 122, 123, 124, 125, 126, 127 };
133
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000134#define SRE_IS_DIGIT(ch)\
135 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
136#define SRE_IS_SPACE(ch)\
137 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
138#define SRE_IS_LINEBREAK(ch)\
139 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
140#define SRE_IS_ALNUM(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
142#define SRE_IS_WORD(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000144
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000145static unsigned int sre_lower(unsigned int ch)
146{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000147 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148}
149
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000150/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000151/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
152 * warnings when c's type supports only numbers < N+1 */
153#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
154#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000155#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000156#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000157#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
158
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000159static unsigned int sre_lower_locale(unsigned int ch)
160{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000161 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000162}
163
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000164/* unicode-specific character predicates */
165
166#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000167
Mark Dickinson1f268282009-07-28 17:22:36 +0000168#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000169#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
170#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000171#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000172#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000173
174static unsigned int sre_lower_unicode(unsigned int ch)
175{
176 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
177}
178
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000179#endif
180
Guido van Rossumb700df92000-03-31 14:59:30 +0000181LOCAL(int)
182sre_category(SRE_CODE category, unsigned int ch)
183{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000184 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000185
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 case SRE_CATEGORY_DIGIT:
187 return SRE_IS_DIGIT(ch);
188 case SRE_CATEGORY_NOT_DIGIT:
189 return !SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_SPACE:
191 return SRE_IS_SPACE(ch);
192 case SRE_CATEGORY_NOT_SPACE:
193 return !SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_WORD:
195 return SRE_IS_WORD(ch);
196 case SRE_CATEGORY_NOT_WORD:
197 return !SRE_IS_WORD(ch);
198 case SRE_CATEGORY_LINEBREAK:
199 return SRE_IS_LINEBREAK(ch);
200 case SRE_CATEGORY_NOT_LINEBREAK:
201 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000202
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000203 case SRE_CATEGORY_LOC_WORD:
204 return SRE_LOC_IS_WORD(ch);
205 case SRE_CATEGORY_LOC_NOT_WORD:
206 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000207
208#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000209 case SRE_CATEGORY_UNI_DIGIT:
210 return SRE_UNI_IS_DIGIT(ch);
211 case SRE_CATEGORY_UNI_NOT_DIGIT:
212 return !SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_SPACE:
214 return SRE_UNI_IS_SPACE(ch);
215 case SRE_CATEGORY_UNI_NOT_SPACE:
216 return !SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_WORD:
218 return SRE_UNI_IS_WORD(ch);
219 case SRE_CATEGORY_UNI_NOT_WORD:
220 return !SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_LINEBREAK:
222 return SRE_UNI_IS_LINEBREAK(ch);
223 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
224 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000225#else
226 case SRE_CATEGORY_UNI_DIGIT:
227 return SRE_IS_DIGIT(ch);
228 case SRE_CATEGORY_UNI_NOT_DIGIT:
229 return !SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_SPACE:
231 return SRE_IS_SPACE(ch);
232 case SRE_CATEGORY_UNI_NOT_SPACE:
233 return !SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_WORD:
235 return SRE_LOC_IS_WORD(ch);
236 case SRE_CATEGORY_UNI_NOT_WORD:
237 return !SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_LINEBREAK:
239 return SRE_IS_LINEBREAK(ch);
240 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
241 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000242#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000243 }
244 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000245}
246
247/* helpers */
248
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000249static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000250data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000253 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000255 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000257}
258
259static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000260data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000261{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000262 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 minsize = state->data_stack_base+size;
264 cursize = state->data_stack_size;
265 if (cursize < minsize) {
266 void* stack;
267 cursize = minsize+minsize/4+1024;
268 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000270 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000271 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 return SRE_ERROR_MEMORY;
273 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000274 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000276 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000277 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000278}
279
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000280/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000281
282#define SRE_CHAR unsigned char
283#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000284#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000285#define SRE_CHARSET sre_charset
286#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000287#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000288#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000289#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000290#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000291
292#if defined(HAVE_UNICODE)
293
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000295#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000297
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000298#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#undef SRE_SEARCH
300#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000301#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000302#undef SRE_INFO
303#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000304#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000305#undef SRE_AT
306#undef SRE_CHAR
307
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000308/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000309
310#define SRE_CHAR Py_UNICODE
311#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000312#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000313#define SRE_CHARSET sre_ucharset
314#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000315#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000316#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000317#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000318#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000319#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000320
321#endif /* SRE_RECURSIVE */
322
323/* -------------------------------------------------------------------- */
324/* String matching engine */
325
326/* the following section is compiled twice, with different character
327 settings */
328
329LOCAL(int)
330SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
331{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000332 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000333
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000334 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000339 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 case SRE_AT_BEGINNING_LINE:
343 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000344 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000345
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000346 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000347 return (((void*) (ptr+1) == state->end &&
348 SRE_IS_LINEBREAK((int) ptr[0])) ||
349 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000350
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000351 case SRE_AT_END_LINE:
352 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000353 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000354
Fredrik Lundh770617b2001-01-14 15:06:11 +0000355 case SRE_AT_END_STRING:
356 return ((void*) ptr == state->end);
357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000358 case SRE_AT_BOUNDARY:
359 if (state->beginning == state->end)
360 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000362 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000364 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000366
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000367 case SRE_AT_NON_BOUNDARY:
368 if (state->beginning == state->end)
369 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000373 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000375
376 case SRE_AT_LOC_BOUNDARY:
377 if (state->beginning == state->end)
378 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000380 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000384
385 case SRE_AT_LOC_NON_BOUNDARY:
386 if (state->beginning == state->end)
387 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000389 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000390 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000392 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000393
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000394#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000395 case SRE_AT_UNI_BOUNDARY:
396 if (state->beginning == state->end)
397 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000398 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000399 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000400 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000401 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000403
404 case SRE_AT_UNI_NON_BOUNDARY:
405 if (state->beginning == state->end)
406 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000407 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000408 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000409 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000411 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000412#endif
413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000414 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000417}
418
419LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000420SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000421{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000423
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 for (;;) {
427 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000428
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000429 case SRE_OP_FAILURE:
430 return !ok;
431
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000432 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000433 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 if (ch == set[0])
435 return ok;
436 set++;
437 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000438
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000439 case SRE_OP_CATEGORY:
440 /* <CATEGORY> <code> */
441 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000442 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000443 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Fredrik Lundh3562f112000-07-02 12:00:07 +0000446 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000447 if (sizeof(SRE_CODE) == 2) {
448 /* <CHARSET> <bitmap> (16 bits per code word) */
449 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
450 return ok;
451 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000452 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000453 else {
454 /* <CHARSET> <bitmap> (32 bits per code word) */
455 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
456 return ok;
457 set += 8;
458 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000459 break;
460
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000461 case SRE_OP_RANGE:
462 /* <RANGE> <lower> <upper> */
463 if (set[0] <= ch && ch <= set[1])
464 return ok;
465 set += 2;
466 break;
467
468 case SRE_OP_NEGATE:
469 ok = !ok;
470 break;
471
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000472 case SRE_OP_BIGCHARSET:
473 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
474 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000475 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000476 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000477
478 if (sizeof(SRE_CODE) == 2) {
479 block = ((unsigned char*)set)[ch >> 8];
480 set += 128;
481 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
482 return ok;
483 set += count*16;
484 }
485 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000486 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
487 * warnings when c's type supports only numbers < N+1 */
488 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000489 block = ((unsigned char*)set)[ch >> 8];
490 else
491 block = -1;
492 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000493 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000494 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
495 return ok;
496 set += count*8;
497 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000498 break;
499 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000500
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000501 default:
502 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000503 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 return 0;
505 }
506 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000507}
508
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000509LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000510
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000511LOCAL(Py_ssize_t)
512SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513{
514 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000515 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
516 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000517 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000518
519 /* adjust end */
520 if (maxcount < end - ptr && maxcount != 65535)
521 end = ptr + maxcount;
522
523 switch (pattern[0]) {
524
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000525 case SRE_OP_IN:
526 /* repeated set */
527 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
528 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
529 ptr++;
530 break;
531
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000532 case SRE_OP_ANY:
533 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000534 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000535 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
536 ptr++;
537 break;
538
539 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000540 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000541 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000542 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000543 ptr = end;
544 break;
545
546 case SRE_OP_LITERAL:
547 /* repeated literal */
548 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 while (ptr < end && (SRE_CODE) *ptr == chr)
551 ptr++;
552 break;
553
554 case SRE_OP_LITERAL_IGNORE:
555 /* repeated literal */
556 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000557 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000558 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
559 ptr++;
560 break;
561
562 case SRE_OP_NOT_LITERAL:
563 /* repeated non-literal */
564 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000565 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000566 while (ptr < end && (SRE_CODE) *ptr != chr)
567 ptr++;
568 break;
Tim Peters3d563502006-01-21 02:47:53 +0000569
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000570 case SRE_OP_NOT_LITERAL_IGNORE:
571 /* repeated non-literal */
572 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000573 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000574 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
575 ptr++;
576 break;
577
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000578 default:
579 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000580 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000582 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 if (i < 0)
584 return i;
585 if (!i)
586 break;
587 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000588 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
589 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 return (SRE_CHAR*) state->ptr - ptr;
591 }
592
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000593 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000594 return ptr - (SRE_CHAR*) state->ptr;
595}
596
Fredrik Lundh33accc12000-08-27 20:59:47 +0000597#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000598LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000599SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
600{
601 /* check if an SRE_OP_INFO block matches at the current position.
602 returns the number of SRE_CODE objects to skip if successful, 0
603 if no match */
604
605 SRE_CHAR* end = state->end;
606 SRE_CHAR* ptr = state->ptr;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000607 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000608
609 /* check minimal length */
610 if (pattern[3] && (end - ptr) < pattern[3])
611 return 0;
612
613 /* check known prefix */
614 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
615 /* <length> <skip> <prefix data> <overlap data> */
616 for (i = 0; i < pattern[5]; i++)
617 if ((SRE_CODE) ptr[i] != pattern[7 + i])
618 return 0;
619 return pattern[0] + 2 * pattern[6];
620 }
621 return pattern[0];
622}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000623#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000624
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000625/* The macros below should be used to protect recursive SRE_MATCH()
626 * calls that *failed* and do *not* return immediately (IOW, those
627 * that will backtrack). Explaining:
628 *
629 * - Recursive SRE_MATCH() returned true: that's usually a success
630 * (besides atypical cases like ASSERT_NOT), therefore there's no
631 * reason to restore lastmark;
632 *
633 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
634 * is returning to the caller: If the current SRE_MATCH() is the
635 * top function of the recursion, returning false will be a matching
636 * failure, and it doesn't matter where lastmark is pointing to.
637 * If it's *not* the top function, it will be a recursive SRE_MATCH()
638 * failure by itself, and the calling SRE_MATCH() will have to deal
639 * with the failure by the same rules explained here (it will restore
640 * lastmark by itself if necessary);
641 *
642 * - Recursive SRE_MATCH() returned false, and will continue the
643 * outside 'for' loop: must be protected when breaking, since the next
644 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000645 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000646 * - Recursive SRE_MATCH() returned false, and will be called again
647 * inside a local for/while loop: must be protected between each
648 * loop iteration, since the recursive SRE_MATCH() could do anything,
649 * and could potentially depend on lastmark.
650 *
651 * For more information, check the discussion at SF patch #712900.
652 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000653#define LASTMARK_SAVE() \
654 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000655 ctx->lastmark = state->lastmark; \
656 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000657 } while (0)
658#define LASTMARK_RESTORE() \
659 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000660 state->lastmark = ctx->lastmark; \
661 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000662 } while (0)
663
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000664#define RETURN_ERROR(i) do { return i; } while(0)
665#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
666#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
667
668#define RETURN_ON_ERROR(i) \
669 do { if (i < 0) RETURN_ERROR(i); } while (0)
670#define RETURN_ON_SUCCESS(i) \
671 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
672#define RETURN_ON_FAILURE(i) \
673 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
674
675#define SFY(x) #x
676
677#define DATA_STACK_ALLOC(state, type, ptr) \
678do { \
679 alloc_pos = state->data_stack_base; \
680 TRACE(("allocating %s in %d (%d)\n", \
681 SFY(type), alloc_pos, sizeof(type))); \
682 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
683 int j = data_stack_grow(state, sizeof(type)); \
684 if (j < 0) return j; \
685 if (ctx_pos != -1) \
686 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
687 } \
688 ptr = (type*)(state->data_stack+alloc_pos); \
689 state->data_stack_base += sizeof(type); \
690} while (0)
691
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000692#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
693do { \
694 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
695 ptr = (type*)(state->data_stack+pos); \
696} while (0)
697
698#define DATA_STACK_PUSH(state, data, size) \
699do { \
700 TRACE(("copy data in %p to %d (%d)\n", \
701 data, state->data_stack_base, size)); \
702 if (state->data_stack_size < state->data_stack_base+size) { \
703 int j = data_stack_grow(state, size); \
704 if (j < 0) return j; \
705 if (ctx_pos != -1) \
706 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
707 } \
708 memcpy(state->data_stack+state->data_stack_base, data, size); \
709 state->data_stack_base += size; \
710} while (0)
711
712#define DATA_STACK_POP(state, data, size, discard) \
713do { \
714 TRACE(("copy data to %p from %d (%d)\n", \
715 data, state->data_stack_base-size, size)); \
716 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
717 if (discard) \
718 state->data_stack_base -= size; \
719} while (0)
720
721#define DATA_STACK_POP_DISCARD(state, size) \
722do { \
723 TRACE(("discard data from %d (%d)\n", \
724 state->data_stack_base-size, size)); \
725 state->data_stack_base -= size; \
726} while(0)
727
728#define DATA_PUSH(x) \
729 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
730#define DATA_POP(x) \
731 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000732#define DATA_POP_DISCARD(x) \
733 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
734#define DATA_ALLOC(t,p) \
735 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000736#define DATA_LOOKUP_AT(t,p,pos) \
737 DATA_STACK_LOOKUP_AT(state,t,p,pos)
738
739#define MARK_PUSH(lastmark) \
740 do if (lastmark > 0) { \
741 i = lastmark; /* ctx->lastmark may change if reallocated */ \
742 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
743 } while (0)
744#define MARK_POP(lastmark) \
745 do if (lastmark > 0) { \
746 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
747 } while (0)
748#define MARK_POP_KEEP(lastmark) \
749 do if (lastmark > 0) { \
750 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
751 } while (0)
752#define MARK_POP_DISCARD(lastmark) \
753 do if (lastmark > 0) { \
754 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
755 } while (0)
756
757#define JUMP_NONE 0
758#define JUMP_MAX_UNTIL_1 1
759#define JUMP_MAX_UNTIL_2 2
760#define JUMP_MAX_UNTIL_3 3
761#define JUMP_MIN_UNTIL_1 4
762#define JUMP_MIN_UNTIL_2 5
763#define JUMP_MIN_UNTIL_3 6
764#define JUMP_REPEAT 7
765#define JUMP_REPEAT_ONE_1 8
766#define JUMP_REPEAT_ONE_2 9
767#define JUMP_MIN_REPEAT_ONE 10
768#define JUMP_BRANCH 11
769#define JUMP_ASSERT 12
770#define JUMP_ASSERT_NOT 13
771
772#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
773 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
774 nextctx->last_ctx_pos = ctx_pos; \
775 nextctx->jump = jumpvalue; \
776 nextctx->pattern = nextpattern; \
777 ctx_pos = alloc_pos; \
778 ctx = nextctx; \
779 goto entrance; \
780 jumplabel: \
781 while (0) /* gcc doesn't like labels at end of scopes */ \
782
783typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000784 Py_ssize_t last_ctx_pos;
785 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000786 SRE_CHAR* ptr;
787 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000788 Py_ssize_t count;
789 Py_ssize_t lastmark;
790 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000791 union {
792 SRE_CODE chr;
793 SRE_REPEAT* rep;
794 } u;
795} SRE_MATCH_CONTEXT;
796
797/* check if string matches the given pattern. returns <0 for
798 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000799LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000800SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000801{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000802 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000803 Py_ssize_t alloc_pos, ctx_pos = -1;
804 Py_ssize_t i, ret = 0;
805 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000806 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000807
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000808 SRE_MATCH_CONTEXT* ctx;
809 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000810
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000811 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000813 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
814 ctx->last_ctx_pos = -1;
815 ctx->jump = JUMP_NONE;
816 ctx->pattern = pattern;
817 ctx_pos = alloc_pos;
818
819entrance:
820
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000821 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000822
823 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000824 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000825 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000826 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000828 (end - ctx->ptr), ctx->pattern[3]));
829 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000832 }
833
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000834 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000835 ++sigcount;
836 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
837 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000840
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000841 case SRE_OP_MARK:
842 /* set mark */
843 /* <MARK> <gid> */
844 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
845 ctx->ptr, ctx->pattern[0]));
846 i = ctx->pattern[0];
847 if (i & 1)
848 state->lastindex = i/2 + 1;
849 if (i > state->lastmark) {
850 /* state->lastmark is the highest valid index in the
851 state->mark array. If it is increased by more than 1,
852 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000853 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000854 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000855 while (j < i)
856 state->mark[j++] = NULL;
857 state->lastmark = i;
858 }
859 state->mark[i] = ctx->ptr;
860 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000862
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 case SRE_OP_LITERAL:
864 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000865 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000866 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
867 ctx->ptr, *ctx->pattern));
868 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
869 RETURN_FAILURE;
870 ctx->pattern++;
871 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000873
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 case SRE_OP_NOT_LITERAL:
875 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000876 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000877 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
884
885 case SRE_OP_SUCCESS:
886 /* end of pattern */
887 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
888 state->ptr = ctx->ptr;
889 RETURN_SUCCESS;
890
891 case SRE_OP_AT:
892 /* match at given position */
893 /* <AT> <code> */
894 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
895 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
896 RETURN_FAILURE;
897 ctx->pattern++;
898 break;
899
900 case SRE_OP_CATEGORY:
901 /* match at given category */
902 /* <CATEGORY> <code> */
903 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
904 ctx->ptr, *ctx->pattern));
905 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
906 RETURN_FAILURE;
907 ctx->pattern++;
908 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000912 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000913 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
915 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
916 RETURN_FAILURE;
917 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000918 break;
919
920 case SRE_OP_ANY_ALL:
921 /* match anything */
922 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
924 if (ctx->ptr >= end)
925 RETURN_FAILURE;
926 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000929 case SRE_OP_IN:
930 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000931 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000932 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
933 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
934 RETURN_FAILURE;
935 ctx->pattern += ctx->pattern[0];
936 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000940 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
941 ctx->pattern, ctx->ptr, ctx->pattern[0]));
942 if (ctx->ptr >= end ||
943 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
944 RETURN_FAILURE;
945 ctx->pattern++;
946 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000950 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
951 ctx->pattern, ctx->ptr, *ctx->pattern));
952 if (ctx->ptr >= end ||
953 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
954 RETURN_FAILURE;
955 ctx->pattern++;
956 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000958
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000960 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
961 if (ctx->ptr >= end
962 || !SRE_CHARSET(ctx->pattern+1,
963 (SRE_CODE)state->lower(*ctx->ptr)))
964 RETURN_FAILURE;
965 ctx->pattern += ctx->pattern[0];
966 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000968
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000969 case SRE_OP_JUMP:
970 case SRE_OP_INFO:
971 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000972 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
974 ctx->ptr, ctx->pattern[0]));
975 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 break;
977
978 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* alternation */
980 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000982 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 ctx->u.rep = state->repeat;
984 if (ctx->u.rep)
985 MARK_PUSH(ctx->lastmark);
986 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
987 if (ctx->pattern[1] == SRE_OP_LITERAL &&
988 (ctx->ptr >= end ||
989 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000990 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000991 if (ctx->pattern[1] == SRE_OP_IN &&
992 (ctx->ptr >= end ||
993 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000994 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000996 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 if (ret) {
998 if (ctx->u.rep)
999 MARK_POP_DISCARD(ctx->lastmark);
1000 RETURN_ON_ERROR(ret);
1001 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001002 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 if (ctx->u.rep)
1004 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001005 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001006 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001007 if (ctx->u.rep)
1008 MARK_POP_DISCARD(ctx->lastmark);
1009 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001010
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001011 case SRE_OP_REPEAT_ONE:
1012 /* match repeated sequence (maximizing regexp) */
1013
1014 /* this operator only works if the repeated item is
1015 exactly one character wide, and we're not already
1016 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001017 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
1019 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1020
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001021 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1022 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 if (ctx->ptr + ctx->pattern[1] > end)
1025 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001029 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1030 RETURN_ON_ERROR(ret);
1031 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1032 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034
1035 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 string. check if the rest of the pattern matches,
1038 and backtrack if not. */
1039
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001040 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001045 state->ptr = ctx->ptr;
1046 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001048
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001049 LASTMARK_SAVE();
1050
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 /* tail starts with a literal. skip positions where
1053 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001056 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1058 ctx->ptr--;
1059 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001061 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1065 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 if (ret) {
1067 RETURN_ON_ERROR(ret);
1068 RETURN_SUCCESS;
1069 }
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001071 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001072
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 ctx->ptr--;
1074 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001075 }
1076
1077 } else {
1078 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001079 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1082 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 if (ret) {
1084 RETURN_ON_ERROR(ret);
1085 RETURN_SUCCESS;
1086 }
1087 ctx->ptr--;
1088 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001089 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001090 }
1091 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093
Guido van Rossum41c99e72003-04-14 17:59:34 +00001094 case SRE_OP_MIN_REPEAT_ONE:
1095 /* match repeated sequence (minimizing regexp) */
1096
1097 /* this operator only works if the repeated item is
1098 exactly one character wide, and we're not already
1099 collecting backtracking points. for other cases,
1100 use the MIN_REPEAT operator */
1101
1102 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1105 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 if (ctx->ptr + ctx->pattern[1] > end)
1108 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001112 if (ctx->pattern[1] == 0)
1113 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114 else {
1115 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001116 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1117 RETURN_ON_ERROR(ret);
1118 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001119 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001120 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 RETURN_FAILURE;
1122 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001125 }
1126
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131
1132 } else {
1133 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001134 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001135 while ((Py_ssize_t)ctx->pattern[2] == 65535
1136 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001137 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1143 }
1144 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001147 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001149 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001150 assert(ret == 1);
1151 ctx->ptr++;
1152 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001153 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001154 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001156 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001158 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001160 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001161 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1163 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
1165 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001166 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001167 if (!ctx->u.rep) {
1168 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001169 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001170 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001171 ctx->u.rep->count = -1;
1172 ctx->u.rep->pattern = ctx->pattern;
1173 ctx->u.rep->prev = state->repeat;
1174 ctx->u.rep->last_ptr = NULL;
1175 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001176
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001177 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001179 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001180 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001181
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001182 if (ret) {
1183 RETURN_ON_ERROR(ret);
1184 RETURN_SUCCESS;
1185 }
1186 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001187
1188 case SRE_OP_MAX_UNTIL:
1189 /* maximizing repeat */
1190 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1191
1192 /* FIXME: we probably need to deal with zero-width
1193 matches in here... */
1194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 ctx->u.rep = state->repeat;
1196 if (!ctx->u.rep)
1197 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001201 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001202
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001203 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1204 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1210 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 if (ret) {
1212 RETURN_ON_ERROR(ret);
1213 RETURN_SUCCESS;
1214 }
1215 ctx->u.rep->count = ctx->count-1;
1216 state->ptr = ctx->ptr;
1217 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001218 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001219
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001220 if ((ctx->count < ctx->u.rep->pattern[2] ||
1221 ctx->u.rep->pattern[2] == 65535) &&
1222 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001223 /* we may have enough matches, but if we can
1224 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001226 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 MARK_PUSH(ctx->lastmark);
1228 /* zero-width match protection */
1229 DATA_PUSH(&ctx->u.rep->last_ptr);
1230 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1232 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001233 DATA_POP(&ctx->u.rep->last_ptr);
1234 if (ret) {
1235 MARK_POP_DISCARD(ctx->lastmark);
1236 RETURN_ON_ERROR(ret);
1237 RETURN_SUCCESS;
1238 }
1239 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001240 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001241 ctx->u.rep->count = ctx->count-1;
1242 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001243 }
1244
1245 /* cannot match more repeated items here. make sure the
1246 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001249 RETURN_ON_SUCCESS(ret);
1250 state->repeat = ctx->u.rep;
1251 state->ptr = ctx->ptr;
1252 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001253
1254 case SRE_OP_MIN_UNTIL:
1255 /* minimizing repeat */
1256 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1257
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 ctx->u.rep = state->repeat;
1259 if (!ctx->u.rep)
1260 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001263
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1267 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001270 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1273 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 if (ret) {
1275 RETURN_ON_ERROR(ret);
1276 RETURN_SUCCESS;
1277 }
1278 ctx->u.rep->count = ctx->count-1;
1279 state->ptr = ctx->ptr;
1280 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001281 }
1282
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001283 LASTMARK_SAVE();
1284
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001285 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001286 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001288 if (ret) {
1289 RETURN_ON_ERROR(ret);
1290 RETURN_SUCCESS;
1291 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001292
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 state->repeat = ctx->u.rep;
1294 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001295
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001296 LASTMARK_RESTORE();
1297
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001298 if (ctx->count >= ctx->u.rep->pattern[2]
1299 && ctx->u.rep->pattern[2] != 65535)
1300 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001301
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001302 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001303 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1304 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 if (ret) {
1306 RETURN_ON_ERROR(ret);
1307 RETURN_SUCCESS;
1308 }
1309 ctx->u.rep->count = ctx->count-1;
1310 state->ptr = ctx->ptr;
1311 RETURN_FAILURE;
1312
1313 case SRE_OP_GROUPREF:
1314 /* match backreference */
1315 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1316 ctx->ptr, ctx->pattern[0]));
1317 i = ctx->pattern[0];
1318 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001319 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001320 if (groupref >= state->lastmark) {
1321 RETURN_FAILURE;
1322 } else {
1323 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1324 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1325 if (!p || !e || e < p)
1326 RETURN_FAILURE;
1327 while (p < e) {
1328 if (ctx->ptr >= end || *ctx->ptr != *p)
1329 RETURN_FAILURE;
1330 p++; ctx->ptr++;
1331 }
1332 }
1333 }
1334 ctx->pattern++;
1335 break;
1336
1337 case SRE_OP_GROUPREF_IGNORE:
1338 /* match backreference */
1339 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1340 ctx->ptr, ctx->pattern[0]));
1341 i = ctx->pattern[0];
1342 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001343 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001344 if (groupref >= state->lastmark) {
1345 RETURN_FAILURE;
1346 } else {
1347 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1348 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1349 if (!p || !e || e < p)
1350 RETURN_FAILURE;
1351 while (p < e) {
1352 if (ctx->ptr >= end ||
1353 state->lower(*ctx->ptr) != state->lower(*p))
1354 RETURN_FAILURE;
1355 p++; ctx->ptr++;
1356 }
1357 }
1358 }
1359 ctx->pattern++;
1360 break;
1361
1362 case SRE_OP_GROUPREF_EXISTS:
1363 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1364 ctx->ptr, ctx->pattern[0]));
1365 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1366 i = ctx->pattern[0];
1367 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001368 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001369 if (groupref >= state->lastmark) {
1370 ctx->pattern += ctx->pattern[1];
1371 break;
1372 } else {
1373 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1374 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1375 if (!p || !e || e < p) {
1376 ctx->pattern += ctx->pattern[1];
1377 break;
1378 }
1379 }
1380 }
1381 ctx->pattern += 2;
1382 break;
1383
1384 case SRE_OP_ASSERT:
1385 /* assert subpattern */
1386 /* <ASSERT> <skip> <back> <pattern> */
1387 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1388 ctx->ptr, ctx->pattern[1]));
1389 state->ptr = ctx->ptr - ctx->pattern[1];
1390 if (state->ptr < state->beginning)
1391 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001392 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001393 RETURN_ON_FAILURE(ret);
1394 ctx->pattern += ctx->pattern[0];
1395 break;
1396
1397 case SRE_OP_ASSERT_NOT:
1398 /* assert not subpattern */
1399 /* <ASSERT_NOT> <skip> <back> <pattern> */
1400 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1401 ctx->ptr, ctx->pattern[1]));
1402 state->ptr = ctx->ptr - ctx->pattern[1];
1403 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001404 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001405 if (ret) {
1406 RETURN_ON_ERROR(ret);
1407 RETURN_FAILURE;
1408 }
1409 }
1410 ctx->pattern += ctx->pattern[0];
1411 break;
1412
1413 case SRE_OP_FAILURE:
1414 /* immediate failure */
1415 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1416 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001417
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001418 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001419 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1420 ctx->pattern[-1]));
1421 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001422 }
1423 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001424
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001425exit:
1426 ctx_pos = ctx->last_ctx_pos;
1427 jump = ctx->jump;
1428 DATA_POP_DISCARD(ctx);
1429 if (ctx_pos == -1)
1430 return ret;
1431 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1432
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001433 switch (jump) {
1434 case JUMP_MAX_UNTIL_2:
1435 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1436 goto jump_max_until_2;
1437 case JUMP_MAX_UNTIL_3:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_3;
1440 case JUMP_MIN_UNTIL_2:
1441 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1442 goto jump_min_until_2;
1443 case JUMP_MIN_UNTIL_3:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_3;
1446 case JUMP_BRANCH:
1447 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1448 goto jump_branch;
1449 case JUMP_MAX_UNTIL_1:
1450 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1451 goto jump_max_until_1;
1452 case JUMP_MIN_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_min_until_1;
1455 case JUMP_REPEAT:
1456 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1457 goto jump_repeat;
1458 case JUMP_REPEAT_ONE_1:
1459 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat_one_1;
1461 case JUMP_REPEAT_ONE_2:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_2;
1464 case JUMP_MIN_REPEAT_ONE:
1465 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1466 goto jump_min_repeat_one;
1467 case JUMP_ASSERT:
1468 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1469 goto jump_assert;
1470 case JUMP_ASSERT_NOT:
1471 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert_not;
1473 case JUMP_NONE:
1474 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1475 break;
1476 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001477
1478 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001479}
1480
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001481LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001482SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1483{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001484 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1485 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001486 Py_ssize_t status = 0;
1487 Py_ssize_t prefix_len = 0;
1488 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001489 SRE_CODE* prefix = NULL;
1490 SRE_CODE* charset = NULL;
1491 SRE_CODE* overlap = NULL;
1492 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001493
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001494 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001496 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001497
1498 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001499
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001500 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001501 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001503 end -= pattern[3]-1;
1504 if (end <= ptr)
1505 end = ptr+1;
1506 }
1507
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001509 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001510 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001511 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001512 prefix_skip = pattern[6];
1513 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 overlap = prefix + prefix_len - 1;
1515 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001516 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001517 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001518 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001519
1520 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001521 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001522
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001523 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1524 TRACE(("charset = %p\n", charset));
1525
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001527 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001528 /* pattern starts with a known prefix. use the overlap
1529 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001530 Py_ssize_t i = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001531 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001532 while (ptr < end) {
1533 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001534 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 if (!i)
1536 break;
1537 else
1538 i = overlap[i];
1539 } else {
1540 if (++i == prefix_len) {
1541 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001542 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1543 state->start = ptr + 1 - prefix_len;
1544 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001545 if (flags & SRE_INFO_LITERAL)
1546 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001547 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001548 if (status != 0)
1549 return status;
1550 /* close but no cigar -- try again */
1551 i = overlap[i];
1552 }
1553 break;
1554 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001555 }
1556 ptr++;
1557 }
1558 return 0;
1559 }
1560#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001561
Fredrik Lundh3562f112000-07-02 12:00:07 +00001562 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001564 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001566 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 for (;;) {
1568 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1569 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001570 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001571 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001572 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 state->start = ptr;
1574 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001575 if (flags & SRE_INFO_LITERAL)
1576 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001577 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 if (status != 0)
1579 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001580 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 } else if (charset) {
1582 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001583 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001587 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001589 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001590 state->start = ptr;
1591 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001592 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 if (status != 0)
1594 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001595 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 }
1597 } else
1598 /* general case */
1599 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001600 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001602 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 if (status != 0)
1604 break;
1605 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001606
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001607 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001608}
Tim Peters3d563502006-01-21 02:47:53 +00001609
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001610LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001611SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001612{
1613 /* check if given string is a literal template (i.e. no escapes) */
1614 while (len-- > 0)
1615 if (*ptr++ == '\\')
1616 return 0;
1617 return 1;
1618}
Guido van Rossumb700df92000-03-31 14:59:30 +00001619
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001620#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001621
1622/* -------------------------------------------------------------------- */
1623/* factories and destructors */
1624
1625/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001626static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1627static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001628
1629static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001630sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001631{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001632 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001633}
1634
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001635static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001636sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001637{
1638 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001639 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001640 return NULL;
1641 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001642 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001643 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001644#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001646#else
1647 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001648#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001649 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001650}
1651
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001652LOCAL(void)
1653state_reset(SRE_STATE* state)
1654{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001656 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001657
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001658 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001659 state->lastindex = -1;
1660
1661 state->repeat = NULL;
1662
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001663 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664}
1665
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001666static void*
Benjamin Petersone48944b2012-03-07 14:50:25 -06001667getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize, Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001668{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001669 /* given a python object, return a data pointer, a length (in
1670 characters), and a character size. return NULL if the object
1671 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001672
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001674 Py_ssize_t size, bytes;
1675 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001677
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001678 /* Unicode objects do not support the buffer API. So, get the data
1679 directly instead. */
1680 if (PyUnicode_Check(string)) {
1681 ptr = (void *)PyUnicode_AS_DATA(string);
1682 *p_length = PyUnicode_GET_SIZE(string);
1683 *p_charsize = sizeof(Py_UNICODE);
1684 return ptr;
1685 }
1686
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001687 /* get pointer to string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001688 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001689 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001690 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001691 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001692 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1693 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001694 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001696 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001697 bytes = view->len;
1698 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001699
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001700 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001702 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001703 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001704
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001705 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001706 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001707
Christian Heimes72b710a2008-05-26 13:28:38 +00001708 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001709 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001710#if defined(HAVE_UNICODE)
Antoine Pitroufd036452008-08-19 17:56:33 +00001711 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001712 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001713#endif
1714 else {
1715 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001716 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001717 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001718
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001719 *p_length = size;
1720 *p_charsize = charsize;
1721
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001722 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001723 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001724 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001725 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001726 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001727 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001728 err:
1729 PyBuffer_Release(view);
1730 view->buf = NULL;
1731 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001732}
1733
1734LOCAL(PyObject*)
1735state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001736 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001737{
1738 /* prepare state object */
1739
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001740 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001741 int charsize;
1742 void* ptr;
1743
1744 memset(state, 0, sizeof(SRE_STATE));
1745
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001746 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001747 state->lastindex = -1;
1748
Benjamin Petersone48944b2012-03-07 14:50:25 -06001749 state->buffer.buf = NULL;
1750 ptr = getstring(string, &length, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001752 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001753
Benjamin Petersone48944b2012-03-07 14:50:25 -06001754 if (charsize == 1 && pattern->charsize > 1) {
1755 PyErr_SetString(PyExc_TypeError,
Antoine Pitroufd036452008-08-19 17:56:33 +00001756 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001757 goto err;
1758 }
1759 if (charsize > 1 && pattern->charsize == 1) {
1760 PyErr_SetString(PyExc_TypeError,
Antoine Pitroufd036452008-08-19 17:56:33 +00001761 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001762 goto err;
1763 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001764
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001765 /* adjust boundaries */
1766 if (start < 0)
1767 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001768 else if (start > length)
1769 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001771 if (end < 0)
1772 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001773 else if (end > length)
1774 end = length;
1775
1776 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001777
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001778 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001779
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001780 state->start = (void*) ((char*) ptr + start * state->charsize);
1781 state->end = (void*) ((char*) ptr + end * state->charsize);
1782
1783 Py_INCREF(string);
1784 state->string = string;
1785 state->pos = start;
1786 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001787
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001788 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001789 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001790 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001791#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001793#else
1794 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001795#endif
1796 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001797 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001798
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001799 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001800 err:
1801 if (state->buffer.buf)
1802 PyBuffer_Release(&state->buffer);
1803 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001804}
1805
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806LOCAL(void)
1807state_fini(SRE_STATE* state)
1808{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001809 if (state->buffer.buf)
1810 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001811 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001812 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001813}
1814
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001815/* calculate offset from start of string */
1816#define STATE_OFFSET(state, member)\
1817 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1818
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001819LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001820state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001821{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001822 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001823
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001824 index = (index - 1) * 2;
1825
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001826 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001827 if (empty)
1828 /* want empty string */
1829 i = j = 0;
1830 else {
1831 Py_INCREF(Py_None);
1832 return Py_None;
1833 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001834 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001835 i = STATE_OFFSET(state, state->mark[index]);
1836 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001837 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001838
Fredrik Lundh58100642000-08-09 09:14:35 +00001839 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001840}
1841
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001842static void
1843pattern_error(int status)
1844{
1845 switch (status) {
1846 case SRE_ERROR_RECURSION_LIMIT:
1847 PyErr_SetString(
1848 PyExc_RuntimeError,
1849 "maximum recursion limit exceeded"
1850 );
1851 break;
1852 case SRE_ERROR_MEMORY:
1853 PyErr_NoMemory();
1854 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001855 case SRE_ERROR_INTERRUPTED:
1856 /* An exception has already been raised, so let it fly */
1857 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001858 default:
1859 /* other error codes indicate compiler/engine bugs */
1860 PyErr_SetString(
1861 PyExc_RuntimeError,
1862 "internal error in regular expression engine"
1863 );
1864 }
1865}
1866
Guido van Rossumb700df92000-03-31 14:59:30 +00001867static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001868pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001869{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001870 if (self->weakreflist != NULL)
1871 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001872 if (self->view.buf)
1873 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001874 Py_XDECREF(self->pattern);
1875 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001876 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001877 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001878}
1879
1880static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001881pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001882{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 SRE_STATE state;
1884 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001885
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001886 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001887 Py_ssize_t start = 0;
1888 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001889 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001890 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001891 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001893
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001894 string = state_init(&state, self, string, start, end);
1895 if (!string)
1896 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001897
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 state.ptr = state.start;
1899
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001900 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1901
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001902 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001903 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001905#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001906 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001907#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001909
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001910 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001911 if (PyErr_Occurred())
1912 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001913
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001914 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001917}
1918
1919static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001920pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001921{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001922 SRE_STATE state;
1923 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001925 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001926 Py_ssize_t start = 0;
1927 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001928 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001929 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001930 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001931 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001932
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 string = state_init(&state, self, string, start, end);
1934 if (!string)
1935 return NULL;
1936
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001937 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001939 if (state.charsize == 1) {
1940 status = sre_search(&state, PatternObject_GetCode(self));
1941 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001942#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001943 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001944#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001945 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001946
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001947 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001949 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001950
Thomas Wouters89f507f2006-12-13 04:49:30 +00001951 if (PyErr_Occurred())
1952 return NULL;
1953
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001954 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001955}
1956
1957static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001958call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959{
1960 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001961 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001962 PyObject* func;
1963 PyObject* result;
1964
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001965 if (!args)
1966 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001967 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 if (!name)
1969 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001970 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001971 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001972 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001973 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001974 func = PyObject_GetAttrString(mod, function);
1975 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001976 if (!func)
1977 return NULL;
1978 result = PyObject_CallObject(func, args);
1979 Py_DECREF(func);
1980 Py_DECREF(args);
1981 return result;
1982}
1983
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001984#ifdef USE_BUILTIN_COPY
1985static int
1986deepcopy(PyObject** object, PyObject* memo)
1987{
1988 PyObject* copy;
1989
1990 copy = call(
1991 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001992 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001993 );
1994 if (!copy)
1995 return 0;
1996
1997 Py_DECREF(*object);
1998 *object = copy;
1999
2000 return 1; /* success */
2001}
2002#endif
2003
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002004static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002005join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002006{
2007 /* join list elements */
2008
2009 PyObject* joiner;
2010#if PY_VERSION_HEX >= 0x01060000
2011 PyObject* function;
2012 PyObject* args;
2013#endif
2014 PyObject* result;
2015
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002016 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002017 if (!joiner)
2018 return NULL;
2019
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002020 if (PyList_GET_SIZE(list) == 0) {
2021 Py_DECREF(list);
2022 return joiner;
2023 }
2024
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002025#if PY_VERSION_HEX >= 0x01060000
2026 function = PyObject_GetAttrString(joiner, "join");
2027 if (!function) {
2028 Py_DECREF(joiner);
2029 return NULL;
2030 }
2031 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002032 if (!args) {
2033 Py_DECREF(function);
2034 Py_DECREF(joiner);
2035 return NULL;
2036 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002037 PyTuple_SET_ITEM(args, 0, list);
2038 result = PyObject_CallObject(function, args);
2039 Py_DECREF(args); /* also removes list */
2040 Py_DECREF(function);
2041#else
2042 result = call(
2043 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002044 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002045 );
2046#endif
2047 Py_DECREF(joiner);
2048
2049 return result;
2050}
2051
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002052static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002053pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002054{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 SRE_STATE state;
2056 PyObject* list;
2057 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002058 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002061 Py_ssize_t start = 0;
2062 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002063 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002064 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002065 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002066 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 string = state_init(&state, self, string, start, end);
2069 if (!string)
2070 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002071
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002073 if (!list) {
2074 state_fini(&state);
2075 return NULL;
2076 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002077
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002078 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002079
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002080 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002081
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002082 state_reset(&state);
2083
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002084 state.ptr = state.start;
2085
2086 if (state.charsize == 1) {
2087 status = sre_search(&state, PatternObject_GetCode(self));
2088 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002089#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002090 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002091#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002092 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002093
Thomas Wouters89f507f2006-12-13 04:49:30 +00002094 if (PyErr_Occurred())
2095 goto error;
2096
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002097 if (status <= 0) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002098 if (status == 0)
2099 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002100 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002101 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002102 }
Tim Peters3d563502006-01-21 02:47:53 +00002103
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002104 /* don't bother to build a match object */
2105 switch (self->groups) {
2106 case 0:
2107 b = STATE_OFFSET(&state, state.start);
2108 e = STATE_OFFSET(&state, state.ptr);
2109 item = PySequence_GetSlice(string, b, e);
2110 if (!item)
2111 goto error;
2112 break;
2113 case 1:
2114 item = state_getslice(&state, 1, string, 1);
2115 if (!item)
2116 goto error;
2117 break;
2118 default:
2119 item = PyTuple_New(self->groups);
2120 if (!item)
2121 goto error;
2122 for (i = 0; i < self->groups; i++) {
2123 PyObject* o = state_getslice(&state, i+1, string, 1);
2124 if (!o) {
2125 Py_DECREF(item);
2126 goto error;
2127 }
2128 PyTuple_SET_ITEM(item, i, o);
2129 }
2130 break;
2131 }
2132
2133 status = PyList_Append(list, item);
2134 Py_DECREF(item);
2135 if (status < 0)
2136 goto error;
2137
2138 if (state.ptr == state.start)
2139 state.start = (void*) ((char*) state.ptr + state.charsize);
2140 else
2141 state.start = state.ptr;
2142
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002143 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002144
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002145 state_fini(&state);
2146 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002147
2148error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002149 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002150 state_fini(&state);
2151 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002152
Guido van Rossumb700df92000-03-31 14:59:30 +00002153}
2154
Fredrik Lundh703ce812001-10-24 22:16:30 +00002155#if PY_VERSION_HEX >= 0x02020000
2156static PyObject*
2157pattern_finditer(PatternObject* pattern, PyObject* args)
2158{
2159 PyObject* scanner;
2160 PyObject* search;
2161 PyObject* iterator;
2162
2163 scanner = pattern_scanner(pattern, args);
2164 if (!scanner)
2165 return NULL;
2166
2167 search = PyObject_GetAttrString(scanner, "search");
2168 Py_DECREF(scanner);
2169 if (!search)
2170 return NULL;
2171
2172 iterator = PyCallIter_New(search, Py_None);
2173 Py_DECREF(search);
2174
2175 return iterator;
2176}
2177#endif
2178
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002179static PyObject*
2180pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2181{
2182 SRE_STATE state;
2183 PyObject* list;
2184 PyObject* item;
2185 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002186 Py_ssize_t n;
2187 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002188 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002189
2190 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002191 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002192 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002193 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002194 &string, &maxsplit))
2195 return NULL;
2196
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002197 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002198 if (!string)
2199 return NULL;
2200
2201 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002202 if (!list) {
2203 state_fini(&state);
2204 return NULL;
2205 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002206
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002207 n = 0;
2208 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002209
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002210 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002211
2212 state_reset(&state);
2213
2214 state.ptr = state.start;
2215
2216 if (state.charsize == 1) {
2217 status = sre_search(&state, PatternObject_GetCode(self));
2218 } else {
2219#if defined(HAVE_UNICODE)
2220 status = sre_usearch(&state, PatternObject_GetCode(self));
2221#endif
2222 }
2223
Thomas Wouters89f507f2006-12-13 04:49:30 +00002224 if (PyErr_Occurred())
2225 goto error;
2226
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002227 if (status <= 0) {
2228 if (status == 0)
2229 break;
2230 pattern_error(status);
2231 goto error;
2232 }
Tim Peters3d563502006-01-21 02:47:53 +00002233
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002234 if (state.start == state.ptr) {
2235 if (last == state.end)
2236 break;
2237 /* skip one character */
2238 state.start = (void*) ((char*) state.ptr + state.charsize);
2239 continue;
2240 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002241
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002242 /* get segment before this match */
2243 item = PySequence_GetSlice(
2244 string, STATE_OFFSET(&state, last),
2245 STATE_OFFSET(&state, state.start)
2246 );
2247 if (!item)
2248 goto error;
2249 status = PyList_Append(list, item);
2250 Py_DECREF(item);
2251 if (status < 0)
2252 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002253
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002254 /* add groups (if any) */
2255 for (i = 0; i < self->groups; i++) {
2256 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002257 if (!item)
2258 goto error;
2259 status = PyList_Append(list, item);
2260 Py_DECREF(item);
2261 if (status < 0)
2262 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002263 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002264
2265 n = n + 1;
2266
2267 last = state.start = state.ptr;
2268
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002269 }
2270
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002271 /* get segment following last match (even if empty) */
2272 item = PySequence_GetSlice(
2273 string, STATE_OFFSET(&state, last), state.endpos
2274 );
2275 if (!item)
2276 goto error;
2277 status = PyList_Append(list, item);
2278 Py_DECREF(item);
2279 if (status < 0)
2280 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002281
2282 state_fini(&state);
2283 return list;
2284
2285error:
2286 Py_DECREF(list);
2287 state_fini(&state);
2288 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002289
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002290}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002291
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002292static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002293pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002294 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002295{
2296 SRE_STATE state;
2297 PyObject* list;
2298 PyObject* item;
2299 PyObject* filter;
2300 PyObject* args;
2301 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002302 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002303 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002304 Py_ssize_t n;
2305 Py_ssize_t i, b, e;
2306 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002307 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002308 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002309
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002310 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002311 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002312 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002313 Py_INCREF(filter);
2314 filter_is_callable = 1;
2315 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002316 /* if not callable, check if it's a literal string */
2317 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002318 view.buf = NULL;
2319 ptr = getstring(ptemplate, &n, &bint, &view);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002320 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002321 if (ptr) {
2322 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002323 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002324 } else {
2325#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002326 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002327#endif
2328 }
2329 } else {
2330 PyErr_Clear();
2331 literal = 0;
2332 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002333 if (view.buf)
2334 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002335 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002336 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002337 Py_INCREF(filter);
2338 filter_is_callable = 0;
2339 } else {
2340 /* not a literal; hand it over to the template compiler */
2341 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002342 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002343 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002344 );
2345 if (!filter)
2346 return NULL;
2347 filter_is_callable = PyCallable_Check(filter);
2348 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002349 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002350
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002351 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002352 if (!string) {
2353 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002354 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002355 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002356
2357 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002358 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002359 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002360 state_fini(&state);
2361 return NULL;
2362 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002363
2364 n = i = 0;
2365
2366 while (!count || n < count) {
2367
2368 state_reset(&state);
2369
2370 state.ptr = state.start;
2371
2372 if (state.charsize == 1) {
2373 status = sre_search(&state, PatternObject_GetCode(self));
2374 } else {
2375#if defined(HAVE_UNICODE)
2376 status = sre_usearch(&state, PatternObject_GetCode(self));
2377#endif
2378 }
2379
Thomas Wouters89f507f2006-12-13 04:49:30 +00002380 if (PyErr_Occurred())
2381 goto error;
2382
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002383 if (status <= 0) {
2384 if (status == 0)
2385 break;
2386 pattern_error(status);
2387 goto error;
2388 }
Tim Peters3d563502006-01-21 02:47:53 +00002389
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002390 b = STATE_OFFSET(&state, state.start);
2391 e = STATE_OFFSET(&state, state.ptr);
2392
2393 if (i < b) {
2394 /* get segment before this match */
2395 item = PySequence_GetSlice(string, i, b);
2396 if (!item)
2397 goto error;
2398 status = PyList_Append(list, item);
2399 Py_DECREF(item);
2400 if (status < 0)
2401 goto error;
2402
2403 } else if (i == b && i == e && n > 0)
2404 /* ignore empty match on latest position */
2405 goto next;
2406
2407 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002408 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002409 match = pattern_new_match(self, &state, 1);
2410 if (!match)
2411 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002412 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002413 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002414 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002415 goto error;
2416 }
2417 item = PyObject_CallObject(filter, args);
2418 Py_DECREF(args);
2419 Py_DECREF(match);
2420 if (!item)
2421 goto error;
2422 } else {
2423 /* filter is literal string */
2424 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002425 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002426 }
2427
2428 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002429 if (item != Py_None) {
2430 status = PyList_Append(list, item);
2431 Py_DECREF(item);
2432 if (status < 0)
2433 goto error;
2434 }
Tim Peters3d563502006-01-21 02:47:53 +00002435
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002436 i = e;
2437 n = n + 1;
2438
2439next:
2440 /* move on */
2441 if (state.ptr == state.start)
2442 state.start = (void*) ((char*) state.ptr + state.charsize);
2443 else
2444 state.start = state.ptr;
2445
2446 }
2447
2448 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002449 if (i < state.endpos) {
2450 item = PySequence_GetSlice(string, i, state.endpos);
2451 if (!item)
2452 goto error;
2453 status = PyList_Append(list, item);
2454 Py_DECREF(item);
2455 if (status < 0)
2456 goto error;
2457 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002458
2459 state_fini(&state);
2460
Guido van Rossum4e173842001-12-07 04:25:10 +00002461 Py_DECREF(filter);
2462
Fredrik Lundhdac58492001-10-21 21:48:30 +00002463 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002464 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002465
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466 if (!item)
2467 return NULL;
2468
2469 if (subn)
2470 return Py_BuildValue("Ni", item, n);
2471
2472 return item;
2473
2474error:
2475 Py_DECREF(list);
2476 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002477 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002478 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002479
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002480}
2481
2482static PyObject*
2483pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2484{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002485 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002486 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002487 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002488 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002489 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002491 return NULL;
2492
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002493 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002494}
2495
2496static PyObject*
2497pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2498{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002499 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002500 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002501 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002502 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002503 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002504 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002505 return NULL;
2506
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002507 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002508}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002509
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002510static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002511pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002512{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002513#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002514 PatternObject* copy;
2515 int offset;
2516
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002517 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2518 if (!copy)
2519 return NULL;
2520
2521 offset = offsetof(PatternObject, groups);
2522
2523 Py_XINCREF(self->groupindex);
2524 Py_XINCREF(self->indexgroup);
2525 Py_XINCREF(self->pattern);
2526
2527 memcpy((char*) copy + offset, (char*) self + offset,
2528 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002529 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002530
2531 return (PyObject*) copy;
2532#else
2533 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2534 return NULL;
2535#endif
2536}
2537
2538static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002539pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002540{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002541#ifdef USE_BUILTIN_COPY
2542 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002543
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002544 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002545 if (!copy)
2546 return NULL;
2547
2548 if (!deepcopy(&copy->groupindex, memo) ||
2549 !deepcopy(&copy->indexgroup, memo) ||
2550 !deepcopy(&copy->pattern, memo)) {
2551 Py_DECREF(copy);
2552 return NULL;
2553 }
2554
2555#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002556 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2557 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002558#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002559}
2560
Raymond Hettinger94478742004-09-24 04:31:19 +00002561PyDoc_STRVAR(pattern_match_doc,
2562"match(string[, pos[, endpos]]) --> match object or None.\n\
2563 Matches zero or more characters at the beginning of the string");
2564
2565PyDoc_STRVAR(pattern_search_doc,
2566"search(string[, pos[, endpos]]) --> match object or None.\n\
2567 Scan through string looking for a match, and return a corresponding\n\
2568 MatchObject instance. Return None if no position in the string matches.");
2569
2570PyDoc_STRVAR(pattern_split_doc,
2571"split(string[, maxsplit = 0]) --> list.\n\
2572 Split string by the occurrences of pattern.");
2573
2574PyDoc_STRVAR(pattern_findall_doc,
2575"findall(string[, pos[, endpos]]) --> list.\n\
2576 Return a list of all non-overlapping matches of pattern in string.");
2577
2578PyDoc_STRVAR(pattern_finditer_doc,
2579"finditer(string[, pos[, endpos]]) --> iterator.\n\
2580 Return an iterator over all non-overlapping matches for the \n\
2581 RE pattern in string. For each match, the iterator returns a\n\
2582 match object.");
2583
2584PyDoc_STRVAR(pattern_sub_doc,
2585"sub(repl, string[, count = 0]) --> newstring\n\
2586 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002587 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002588
2589PyDoc_STRVAR(pattern_subn_doc,
2590"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2591 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2592 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002593 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002594
2595PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2596
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002597static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002598 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002599 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002600 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002601 pattern_search_doc},
2602 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2603 pattern_sub_doc},
2604 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2605 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002606 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002607 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002608 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002609 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002610#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002611 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2612 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002613#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002614 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002615 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2616 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002617 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002618};
2619
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002620#define PAT_OFF(x) offsetof(PatternObject, x)
2621static PyMemberDef pattern_members[] = {
2622 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2623 {"flags", T_INT, PAT_OFF(flags), READONLY},
2624 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2625 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2626 {NULL} /* Sentinel */
2627};
Guido van Rossumb700df92000-03-31 14:59:30 +00002628
Neal Norwitz57c179c2006-03-22 07:18:02 +00002629static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002630 PyVarObject_HEAD_INIT(NULL, 0)
2631 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002632 sizeof(PatternObject), sizeof(SRE_CODE),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002633 (destructor)pattern_dealloc, /* tp_dealloc */
2634 0, /* tp_print */
2635 0, /* tp_getattr */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002636 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00002637 0, /* tp_reserved */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002638 0, /* tp_repr */
2639 0, /* tp_as_number */
2640 0, /* tp_as_sequence */
2641 0, /* tp_as_mapping */
2642 0, /* tp_hash */
2643 0, /* tp_call */
2644 0, /* tp_str */
2645 0, /* tp_getattro */
2646 0, /* tp_setattro */
2647 0, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00002648 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002649 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002650 0, /* tp_traverse */
2651 0, /* tp_clear */
2652 0, /* tp_richcompare */
2653 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002654 0, /* tp_iter */
2655 0, /* tp_iternext */
2656 pattern_methods, /* tp_methods */
2657 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002658};
2659
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002660static int _validate(PatternObject *self); /* Forward */
2661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002662static PyObject *
2663_compile(PyObject* self_, PyObject* args)
2664{
2665 /* "compile" pattern descriptor to pattern object */
2666
2667 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002668 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002669
2670 PyObject* pattern;
2671 int flags = 0;
2672 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002673 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002674 PyObject* groupindex = NULL;
2675 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002676
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002677 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002678 &PyList_Type, &code, &groups,
2679 &groupindex, &indexgroup))
2680 return NULL;
2681
2682 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002683 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002684 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2685 if (!self)
2686 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002687 self->weakreflist = NULL;
2688 self->pattern = NULL;
2689 self->groupindex = NULL;
2690 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002691 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002692
2693 self->codesize = n;
2694
2695 for (i = 0; i < n; i++) {
2696 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002697 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002698 self->code[i] = (SRE_CODE) value;
2699 if ((unsigned long) self->code[i] != value) {
2700 PyErr_SetString(PyExc_OverflowError,
2701 "regular expression code size limit exceeded");
2702 break;
2703 }
2704 }
2705
2706 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002707 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002708 return NULL;
2709 }
2710
Benjamin Petersone48944b2012-03-07 14:50:25 -06002711 if (pattern == Py_None)
2712 self->charsize = -1;
2713 else {
2714 Py_ssize_t p_length;
2715 if (!getstring(pattern, &p_length, &self->charsize, &self->view)) {
2716 Py_DECREF(self);
2717 return NULL;
2718 }
2719 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002720
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002721 Py_INCREF(pattern);
2722 self->pattern = pattern;
2723
2724 self->flags = flags;
2725
2726 self->groups = groups;
2727
2728 Py_XINCREF(groupindex);
2729 self->groupindex = groupindex;
2730
2731 Py_XINCREF(indexgroup);
2732 self->indexgroup = indexgroup;
2733
2734 self->weakreflist = NULL;
2735
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002736 if (!_validate(self)) {
2737 Py_DECREF(self);
2738 return NULL;
2739 }
2740
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002741 return (PyObject*) self;
2742}
2743
Guido van Rossumb700df92000-03-31 14:59:30 +00002744/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002745/* Code validation */
2746
2747/* To learn more about this code, have a look at the _compile() function in
2748 Lib/sre_compile.py. The validation functions below checks the code array
2749 for conformance with the code patterns generated there.
2750
2751 The nice thing about the generated code is that it is position-independent:
2752 all jumps are relative jumps forward. Also, jumps don't cross each other:
2753 the target of a later jump is always earlier than the target of an earlier
2754 jump. IOW, this is okay:
2755
2756 J---------J-------T--------T
2757 \ \_____/ /
2758 \______________________/
2759
2760 but this is not:
2761
2762 J---------J-------T--------T
2763 \_________\_____/ /
2764 \____________/
2765
2766 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2767 bytes wide (the latter if Python is compiled for "wide" unicode support).
2768*/
2769
2770/* Defining this one enables tracing of the validator */
2771#undef VVERBOSE
2772
2773/* Trace macro for the validator */
2774#if defined(VVERBOSE)
2775#define VTRACE(v) printf v
2776#else
Senthil Kumaran202a3c42011-10-20 02:15:36 +08002777#define VTRACE(v) do {} while(0) /* do nothing */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002778#endif
2779
2780/* Report failure */
2781#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2782
2783/* Extract opcode, argument, or skip count from code array */
2784#define GET_OP \
2785 do { \
2786 VTRACE(("%p: ", code)); \
2787 if (code >= end) FAIL; \
2788 op = *code++; \
2789 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2790 } while (0)
2791#define GET_ARG \
2792 do { \
2793 VTRACE(("%p= ", code)); \
2794 if (code >= end) FAIL; \
2795 arg = *code++; \
2796 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2797 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002798#define GET_SKIP_ADJ(adj) \
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002799 do { \
2800 VTRACE(("%p= ", code)); \
2801 if (code >= end) FAIL; \
2802 skip = *code; \
2803 VTRACE(("%lu (skip to %p)\n", \
2804 (unsigned long)skip, code+skip)); \
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002805 if (code+skip-adj < code || code+skip-adj > end)\
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002806 FAIL; \
2807 code++; \
2808 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002809#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002810
2811static int
2812_validate_charset(SRE_CODE *code, SRE_CODE *end)
2813{
2814 /* Some variables are manipulated by the macros above */
2815 SRE_CODE op;
2816 SRE_CODE arg;
2817 SRE_CODE offset;
2818 int i;
2819
2820 while (code < end) {
2821 GET_OP;
2822 switch (op) {
2823
2824 case SRE_OP_NEGATE:
2825 break;
2826
2827 case SRE_OP_LITERAL:
2828 GET_ARG;
2829 break;
2830
2831 case SRE_OP_RANGE:
2832 GET_ARG;
2833 GET_ARG;
2834 break;
2835
2836 case SRE_OP_CHARSET:
2837 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2838 if (code+offset < code || code+offset > end)
2839 FAIL;
2840 code += offset;
2841 break;
2842
2843 case SRE_OP_BIGCHARSET:
2844 GET_ARG; /* Number of blocks */
2845 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2846 if (code+offset < code || code+offset > end)
2847 FAIL;
2848 /* Make sure that each byte points to a valid block */
2849 for (i = 0; i < 256; i++) {
2850 if (((unsigned char *)code)[i] >= arg)
2851 FAIL;
2852 }
2853 code += offset;
2854 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2855 if (code+offset < code || code+offset > end)
2856 FAIL;
2857 code += offset;
2858 break;
2859
2860 case SRE_OP_CATEGORY:
2861 GET_ARG;
2862 switch (arg) {
2863 case SRE_CATEGORY_DIGIT:
2864 case SRE_CATEGORY_NOT_DIGIT:
2865 case SRE_CATEGORY_SPACE:
2866 case SRE_CATEGORY_NOT_SPACE:
2867 case SRE_CATEGORY_WORD:
2868 case SRE_CATEGORY_NOT_WORD:
2869 case SRE_CATEGORY_LINEBREAK:
2870 case SRE_CATEGORY_NOT_LINEBREAK:
2871 case SRE_CATEGORY_LOC_WORD:
2872 case SRE_CATEGORY_LOC_NOT_WORD:
2873 case SRE_CATEGORY_UNI_DIGIT:
2874 case SRE_CATEGORY_UNI_NOT_DIGIT:
2875 case SRE_CATEGORY_UNI_SPACE:
2876 case SRE_CATEGORY_UNI_NOT_SPACE:
2877 case SRE_CATEGORY_UNI_WORD:
2878 case SRE_CATEGORY_UNI_NOT_WORD:
2879 case SRE_CATEGORY_UNI_LINEBREAK:
2880 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2881 break;
2882 default:
2883 FAIL;
2884 }
2885 break;
2886
2887 default:
2888 FAIL;
2889
2890 }
2891 }
2892
2893 return 1;
2894}
2895
2896static int
2897_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2898{
2899 /* Some variables are manipulated by the macros above */
2900 SRE_CODE op;
2901 SRE_CODE arg;
2902 SRE_CODE skip;
2903
2904 VTRACE(("code=%p, end=%p\n", code, end));
2905
2906 if (code > end)
2907 FAIL;
2908
2909 while (code < end) {
2910 GET_OP;
2911 switch (op) {
2912
2913 case SRE_OP_MARK:
2914 /* We don't check whether marks are properly nested; the
2915 sre_match() code is robust even if they don't, and the worst
2916 you can get is nonsensical match results. */
2917 GET_ARG;
2918 if (arg > 2*groups+1) {
2919 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2920 FAIL;
2921 }
2922 break;
2923
2924 case SRE_OP_LITERAL:
2925 case SRE_OP_NOT_LITERAL:
2926 case SRE_OP_LITERAL_IGNORE:
2927 case SRE_OP_NOT_LITERAL_IGNORE:
2928 GET_ARG;
2929 /* The arg is just a character, nothing to check */
2930 break;
2931
2932 case SRE_OP_SUCCESS:
2933 case SRE_OP_FAILURE:
2934 /* Nothing to check; these normally end the matching process */
2935 break;
2936
2937 case SRE_OP_AT:
2938 GET_ARG;
2939 switch (arg) {
2940 case SRE_AT_BEGINNING:
2941 case SRE_AT_BEGINNING_STRING:
2942 case SRE_AT_BEGINNING_LINE:
2943 case SRE_AT_END:
2944 case SRE_AT_END_LINE:
2945 case SRE_AT_END_STRING:
2946 case SRE_AT_BOUNDARY:
2947 case SRE_AT_NON_BOUNDARY:
2948 case SRE_AT_LOC_BOUNDARY:
2949 case SRE_AT_LOC_NON_BOUNDARY:
2950 case SRE_AT_UNI_BOUNDARY:
2951 case SRE_AT_UNI_NON_BOUNDARY:
2952 break;
2953 default:
2954 FAIL;
2955 }
2956 break;
2957
2958 case SRE_OP_ANY:
2959 case SRE_OP_ANY_ALL:
2960 /* These have no operands */
2961 break;
2962
2963 case SRE_OP_IN:
2964 case SRE_OP_IN_IGNORE:
2965 GET_SKIP;
2966 /* Stop 1 before the end; we check the FAILURE below */
2967 if (!_validate_charset(code, code+skip-2))
2968 FAIL;
2969 if (code[skip-2] != SRE_OP_FAILURE)
2970 FAIL;
2971 code += skip-1;
2972 break;
2973
2974 case SRE_OP_INFO:
2975 {
2976 /* A minimal info field is
2977 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2978 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2979 more follows. */
2980 SRE_CODE flags, min, max, i;
2981 SRE_CODE *newcode;
2982 GET_SKIP;
2983 newcode = code+skip-1;
2984 GET_ARG; flags = arg;
2985 GET_ARG; min = arg;
2986 GET_ARG; max = arg;
2987 /* Check that only valid flags are present */
2988 if ((flags & ~(SRE_INFO_PREFIX |
2989 SRE_INFO_LITERAL |
2990 SRE_INFO_CHARSET)) != 0)
2991 FAIL;
2992 /* PREFIX and CHARSET are mutually exclusive */
2993 if ((flags & SRE_INFO_PREFIX) &&
2994 (flags & SRE_INFO_CHARSET))
2995 FAIL;
2996 /* LITERAL implies PREFIX */
2997 if ((flags & SRE_INFO_LITERAL) &&
2998 !(flags & SRE_INFO_PREFIX))
2999 FAIL;
3000 /* Validate the prefix */
3001 if (flags & SRE_INFO_PREFIX) {
3002 SRE_CODE prefix_len, prefix_skip;
3003 GET_ARG; prefix_len = arg;
3004 GET_ARG; prefix_skip = arg;
3005 /* Here comes the prefix string */
3006 if (code+prefix_len < code || code+prefix_len > newcode)
3007 FAIL;
3008 code += prefix_len;
3009 /* And here comes the overlap table */
3010 if (code+prefix_len < code || code+prefix_len > newcode)
3011 FAIL;
3012 /* Each overlap value should be < prefix_len */
3013 for (i = 0; i < prefix_len; i++) {
3014 if (code[i] >= prefix_len)
3015 FAIL;
3016 }
3017 code += prefix_len;
3018 }
3019 /* Validate the charset */
3020 if (flags & SRE_INFO_CHARSET) {
3021 if (!_validate_charset(code, newcode-1))
3022 FAIL;
3023 if (newcode[-1] != SRE_OP_FAILURE)
3024 FAIL;
3025 code = newcode;
3026 }
3027 else if (code != newcode) {
3028 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3029 FAIL;
3030 }
3031 }
3032 break;
3033
3034 case SRE_OP_BRANCH:
3035 {
3036 SRE_CODE *target = NULL;
3037 for (;;) {
3038 GET_SKIP;
3039 if (skip == 0)
3040 break;
3041 /* Stop 2 before the end; we check the JUMP below */
3042 if (!_validate_inner(code, code+skip-3, groups))
3043 FAIL;
3044 code += skip-3;
3045 /* Check that it ends with a JUMP, and that each JUMP
3046 has the same target */
3047 GET_OP;
3048 if (op != SRE_OP_JUMP)
3049 FAIL;
3050 GET_SKIP;
3051 if (target == NULL)
3052 target = code+skip-1;
3053 else if (code+skip-1 != target)
3054 FAIL;
3055 }
3056 }
3057 break;
3058
3059 case SRE_OP_REPEAT_ONE:
3060 case SRE_OP_MIN_REPEAT_ONE:
3061 {
3062 SRE_CODE min, max;
3063 GET_SKIP;
3064 GET_ARG; min = arg;
3065 GET_ARG; max = arg;
3066 if (min > max)
3067 FAIL;
3068#ifdef Py_UNICODE_WIDE
3069 if (max > 65535)
3070 FAIL;
3071#endif
3072 if (!_validate_inner(code, code+skip-4, groups))
3073 FAIL;
3074 code += skip-4;
3075 GET_OP;
3076 if (op != SRE_OP_SUCCESS)
3077 FAIL;
3078 }
3079 break;
3080
3081 case SRE_OP_REPEAT:
3082 {
3083 SRE_CODE min, max;
3084 GET_SKIP;
3085 GET_ARG; min = arg;
3086 GET_ARG; max = arg;
3087 if (min > max)
3088 FAIL;
3089#ifdef Py_UNICODE_WIDE
3090 if (max > 65535)
3091 FAIL;
3092#endif
3093 if (!_validate_inner(code, code+skip-3, groups))
3094 FAIL;
3095 code += skip-3;
3096 GET_OP;
3097 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3098 FAIL;
3099 }
3100 break;
3101
3102 case SRE_OP_GROUPREF:
3103 case SRE_OP_GROUPREF_IGNORE:
3104 GET_ARG;
3105 if (arg >= groups)
3106 FAIL;
3107 break;
3108
3109 case SRE_OP_GROUPREF_EXISTS:
3110 /* The regex syntax for this is: '(?(group)then|else)', where
3111 'group' is either an integer group number or a group name,
3112 'then' and 'else' are sub-regexes, and 'else' is optional. */
3113 GET_ARG;
3114 if (arg >= groups)
3115 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003116 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003117 code--; /* The skip is relative to the first arg! */
3118 /* There are two possibilities here: if there is both a 'then'
3119 part and an 'else' part, the generated code looks like:
3120
3121 GROUPREF_EXISTS
3122 <group>
3123 <skipyes>
3124 ...then part...
3125 JUMP
3126 <skipno>
3127 (<skipyes> jumps here)
3128 ...else part...
3129 (<skipno> jumps here)
3130
3131 If there is only a 'then' part, it looks like:
3132
3133 GROUPREF_EXISTS
3134 <group>
3135 <skip>
3136 ...then part...
3137 (<skip> jumps here)
3138
3139 There is no direct way to decide which it is, and we don't want
3140 to allow arbitrary jumps anywhere in the code; so we just look
3141 for a JUMP opcode preceding our skip target.
3142 */
3143 if (skip >= 3 && code+skip-3 >= code &&
3144 code[skip-3] == SRE_OP_JUMP)
3145 {
3146 VTRACE(("both then and else parts present\n"));
3147 if (!_validate_inner(code+1, code+skip-3, groups))
3148 FAIL;
3149 code += skip-2; /* Position after JUMP, at <skipno> */
3150 GET_SKIP;
3151 if (!_validate_inner(code, code+skip-1, groups))
3152 FAIL;
3153 code += skip-1;
3154 }
3155 else {
3156 VTRACE(("only a then part present\n"));
3157 if (!_validate_inner(code+1, code+skip-1, groups))
3158 FAIL;
3159 code += skip-1;
3160 }
3161 break;
3162
3163 case SRE_OP_ASSERT:
3164 case SRE_OP_ASSERT_NOT:
3165 GET_SKIP;
3166 GET_ARG; /* 0 for lookahead, width for lookbehind */
3167 code--; /* Back up over arg to simplify math below */
3168 if (arg & 0x80000000)
3169 FAIL; /* Width too large */
3170 /* Stop 1 before the end; we check the SUCCESS below */
3171 if (!_validate_inner(code+1, code+skip-2, groups))
3172 FAIL;
3173 code += skip-2;
3174 GET_OP;
3175 if (op != SRE_OP_SUCCESS)
3176 FAIL;
3177 break;
3178
3179 default:
3180 FAIL;
3181
3182 }
3183 }
3184
3185 VTRACE(("okay\n"));
3186 return 1;
3187}
3188
3189static int
3190_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3191{
3192 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3193 FAIL;
3194 if (groups == 0) /* fix for simplejson */
3195 groups = 100; /* 100 groups should always be safe */
3196 return _validate_inner(code, end-1, groups);
3197}
3198
3199static int
3200_validate(PatternObject *self)
3201{
3202 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3203 {
3204 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3205 return 0;
3206 }
3207 else
3208 VTRACE(("Success!\n"));
3209 return 1;
3210}
3211
3212/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003213/* match methods */
3214
3215static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003216match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003217{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003218 Py_XDECREF(self->regs);
3219 Py_XDECREF(self->string);
3220 Py_DECREF(self->pattern);
3221 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003222}
3223
3224static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003225match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003226{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003227 if (index < 0 || index >= self->groups) {
3228 /* raise IndexError if we were given a bad group number */
3229 PyErr_SetString(
3230 PyExc_IndexError,
3231 "no such group"
3232 );
3233 return NULL;
3234 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003235
Fredrik Lundh6f013982000-07-03 18:44:21 +00003236 index *= 2;
3237
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003238 if (self->string == Py_None || self->mark[index] < 0) {
3239 /* return default value if the string or group is undefined */
3240 Py_INCREF(def);
3241 return def;
3242 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003243
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 return PySequence_GetSlice(
3245 self->string, self->mark[index], self->mark[index+1]
3246 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003247}
3248
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003249static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003250match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003251{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003252 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003253
Guido van Rossumddefaf32007-01-14 03:31:43 +00003254 if (index == NULL)
3255 /* Default value */
3256 return 0;
3257
Christian Heimes217cfd12007-12-02 14:31:20 +00003258 if (PyLong_Check(index))
3259 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003260
Fredrik Lundh6f013982000-07-03 18:44:21 +00003261 i = -1;
3262
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003263 if (self->pattern->groupindex) {
3264 index = PyObject_GetItem(self->pattern->groupindex, index);
3265 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003266 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003267 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003268 Py_DECREF(index);
3269 } else
3270 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003271 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003272
3273 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003274}
3275
3276static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003277match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003278{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003279 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003280}
3281
3282static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003283match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003284{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003285 /* delegate to Python code */
3286 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003287 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003288 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003289 );
3290}
3291
3292static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003293match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003294{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003295 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003296 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003297
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003298 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003299
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003300 switch (size) {
3301 case 0:
3302 result = match_getslice(self, Py_False, Py_None);
3303 break;
3304 case 1:
3305 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3306 break;
3307 default:
3308 /* fetch multiple items */
3309 result = PyTuple_New(size);
3310 if (!result)
3311 return NULL;
3312 for (i = 0; i < size; i++) {
3313 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003314 self, PyTuple_GET_ITEM(args, i), Py_None
3315 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003316 if (!item) {
3317 Py_DECREF(result);
3318 return NULL;
3319 }
3320 PyTuple_SET_ITEM(result, i, item);
3321 }
3322 break;
3323 }
3324 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003325}
3326
3327static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003328match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003329{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003330 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003331 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003332
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003333 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003334 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003335 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003336 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003338 result = PyTuple_New(self->groups-1);
3339 if (!result)
3340 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003342 for (index = 1; index < self->groups; index++) {
3343 PyObject* item;
3344 item = match_getslice_by_index(self, index, def);
3345 if (!item) {
3346 Py_DECREF(result);
3347 return NULL;
3348 }
3349 PyTuple_SET_ITEM(result, index-1, item);
3350 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003351
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003352 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003353}
3354
3355static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003356match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003357{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003358 PyObject* result;
3359 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003360 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003361
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003362 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003363 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003364 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003365 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003366
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003367 result = PyDict_New();
3368 if (!result || !self->pattern->groupindex)
3369 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003370
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003371 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003372 if (!keys)
3373 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003374
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003375 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003376 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003377 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003378 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003379 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003380 if (!key)
3381 goto failed;
3382 value = match_getslice(self, key, def);
3383 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003384 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003385 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003386 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003387 status = PyDict_SetItem(result, key, value);
3388 Py_DECREF(value);
3389 if (status < 0)
3390 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003391 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003392
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003393 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003395 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003396
3397failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003398 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003399 Py_DECREF(result);
3400 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003401}
3402
3403static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003404match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003405{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003406 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003407
Guido van Rossumddefaf32007-01-14 03:31:43 +00003408 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003409 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003410 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003411
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003412 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003414 if (index < 0 || index >= self->groups) {
3415 PyErr_SetString(
3416 PyExc_IndexError,
3417 "no such group"
3418 );
3419 return NULL;
3420 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003421
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003422 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003423 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003424}
3425
3426static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003427match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003428{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003429 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003430
Guido van Rossumddefaf32007-01-14 03:31:43 +00003431 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003432 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003433 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003434
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003435 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003436
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003437 if (index < 0 || index >= self->groups) {
3438 PyErr_SetString(
3439 PyExc_IndexError,
3440 "no such group"
3441 );
3442 return NULL;
3443 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003444
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003445 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003446 return Py_BuildValue("i", self->mark[index*2+1]);
3447}
3448
3449LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003450_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003451{
3452 PyObject* pair;
3453 PyObject* item;
3454
3455 pair = PyTuple_New(2);
3456 if (!pair)
3457 return NULL;
3458
Christian Heimes217cfd12007-12-02 14:31:20 +00003459 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003460 if (!item)
3461 goto error;
3462 PyTuple_SET_ITEM(pair, 0, item);
3463
Christian Heimes217cfd12007-12-02 14:31:20 +00003464 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003465 if (!item)
3466 goto error;
3467 PyTuple_SET_ITEM(pair, 1, item);
3468
3469 return pair;
3470
3471 error:
3472 Py_DECREF(pair);
3473 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003474}
3475
3476static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003477match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003478{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003479 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003480
Guido van Rossumddefaf32007-01-14 03:31:43 +00003481 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003482 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003483 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003484
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003485 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003486
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003487 if (index < 0 || index >= self->groups) {
3488 PyErr_SetString(
3489 PyExc_IndexError,
3490 "no such group"
3491 );
3492 return NULL;
3493 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003494
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003495 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003496 return _pair(self->mark[index*2], self->mark[index*2+1]);
3497}
3498
3499static PyObject*
3500match_regs(MatchObject* self)
3501{
3502 PyObject* regs;
3503 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003504 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003505
3506 regs = PyTuple_New(self->groups);
3507 if (!regs)
3508 return NULL;
3509
3510 for (index = 0; index < self->groups; index++) {
3511 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3512 if (!item) {
3513 Py_DECREF(regs);
3514 return NULL;
3515 }
3516 PyTuple_SET_ITEM(regs, index, item);
3517 }
3518
3519 Py_INCREF(regs);
3520 self->regs = regs;
3521
3522 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003523}
3524
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003525static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003526match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003527{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003528#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003529 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003530 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003531
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003532 slots = 2 * (self->pattern->groups+1);
3533
3534 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3535 if (!copy)
3536 return NULL;
3537
3538 /* this value a constant, but any compiler should be able to
3539 figure that out all by itself */
3540 offset = offsetof(MatchObject, string);
3541
3542 Py_XINCREF(self->pattern);
3543 Py_XINCREF(self->string);
3544 Py_XINCREF(self->regs);
3545
3546 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003547 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003548
3549 return (PyObject*) copy;
3550#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003551 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003552 return NULL;
3553#endif
3554}
3555
3556static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003557match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003558{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003559#ifdef USE_BUILTIN_COPY
3560 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003561
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003562 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003563 if (!copy)
3564 return NULL;
3565
3566 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3567 !deepcopy(&copy->string, memo) ||
3568 !deepcopy(&copy->regs, memo)) {
3569 Py_DECREF(copy);
3570 return NULL;
3571 }
3572
3573#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003574 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3575 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003576#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003577}
3578
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003579static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003580 {"group", (PyCFunction) match_group, METH_VARARGS},
3581 {"start", (PyCFunction) match_start, METH_VARARGS},
3582 {"end", (PyCFunction) match_end, METH_VARARGS},
3583 {"span", (PyCFunction) match_span, METH_VARARGS},
3584 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3585 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003586 {"expand", (PyCFunction) match_expand, METH_O},
3587 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3588 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003589 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003590};
3591
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003592static PyObject *
3593match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003594{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003595 if (self->lastindex >= 0)
3596 return Py_BuildValue("i", self->lastindex);
3597 Py_INCREF(Py_None);
3598 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003599}
3600
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003601static PyObject *
3602match_lastgroup_get(MatchObject *self)
3603{
3604 if (self->pattern->indexgroup && self->lastindex >= 0) {
3605 PyObject* result = PySequence_GetItem(
3606 self->pattern->indexgroup, self->lastindex
3607 );
3608 if (result)
3609 return result;
3610 PyErr_Clear();
3611 }
3612 Py_INCREF(Py_None);
3613 return Py_None;
3614}
3615
3616static PyObject *
3617match_regs_get(MatchObject *self)
3618{
3619 if (self->regs) {
3620 Py_INCREF(self->regs);
3621 return self->regs;
3622 } else
3623 return match_regs(self);
3624}
3625
3626static PyGetSetDef match_getset[] = {
3627 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3628 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3629 {"regs", (getter)match_regs_get, (setter)NULL},
3630 {NULL}
3631};
3632
3633#define MATCH_OFF(x) offsetof(MatchObject, x)
3634static PyMemberDef match_members[] = {
3635 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3636 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3637 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3638 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3639 {NULL}
3640};
3641
Guido van Rossumb700df92000-03-31 14:59:30 +00003642/* FIXME: implement setattr("string", None) as a special case (to
3643 detach the associated string, if any */
3644
Neal Norwitz57c179c2006-03-22 07:18:02 +00003645static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003646 PyVarObject_HEAD_INIT(NULL,0)
3647 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003648 sizeof(MatchObject), sizeof(Py_ssize_t),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003649 (destructor)match_dealloc, /* tp_dealloc */
3650 0, /* tp_print */
3651 0, /* tp_getattr */
3652 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003653 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003654 0, /* tp_repr */
3655 0, /* tp_as_number */
3656 0, /* tp_as_sequence */
3657 0, /* tp_as_mapping */
3658 0, /* tp_hash */
3659 0, /* tp_call */
3660 0, /* tp_str */
3661 0, /* tp_getattro */
3662 0, /* tp_setattro */
3663 0, /* tp_as_buffer */
3664 Py_TPFLAGS_DEFAULT, /* tp_flags */
3665 0, /* tp_doc */
3666 0, /* tp_traverse */
3667 0, /* tp_clear */
3668 0, /* tp_richcompare */
3669 0, /* tp_weaklistoffset */
3670 0, /* tp_iter */
3671 0, /* tp_iternext */
3672 match_methods, /* tp_methods */
3673 match_members, /* tp_members */
3674 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003675};
3676
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003677static PyObject*
3678pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3679{
3680 /* create match object (from state object) */
3681
3682 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003683 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003684 char* base;
3685 int n;
3686
3687 if (status > 0) {
3688
3689 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003690 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003691 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3692 2*(pattern->groups+1));
3693 if (!match)
3694 return NULL;
3695
3696 Py_INCREF(pattern);
3697 match->pattern = pattern;
3698
3699 Py_INCREF(state->string);
3700 match->string = state->string;
3701
3702 match->regs = NULL;
3703 match->groups = pattern->groups+1;
3704
3705 /* fill in group slices */
3706
3707 base = (char*) state->beginning;
3708 n = state->charsize;
3709
3710 match->mark[0] = ((char*) state->start - base) / n;
3711 match->mark[1] = ((char*) state->ptr - base) / n;
3712
3713 for (i = j = 0; i < pattern->groups; i++, j+=2)
3714 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3715 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3716 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3717 } else
3718 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3719
3720 match->pos = state->pos;
3721 match->endpos = state->endpos;
3722
3723 match->lastindex = state->lastindex;
3724
3725 return (PyObject*) match;
3726
3727 } else if (status == 0) {
3728
3729 /* no match */
3730 Py_INCREF(Py_None);
3731 return Py_None;
3732
3733 }
3734
3735 /* internal error */
3736 pattern_error(status);
3737 return NULL;
3738}
3739
3740
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003741/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003742/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003743
3744static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003745scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003746{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003747 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003748 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003749 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003750}
3751
3752static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003753scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003754{
3755 SRE_STATE* state = &self->state;
3756 PyObject* match;
3757 int status;
3758
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003759 state_reset(state);
3760
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003761 state->ptr = state->start;
3762
3763 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003764 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003765 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003766#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003767 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003768#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003769 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003770 if (PyErr_Occurred())
3771 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003772
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003773 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003774 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003775
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003776 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003777 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003778 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003779 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003780
3781 return match;
3782}
3783
3784
3785static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003786scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003787{
3788 SRE_STATE* state = &self->state;
3789 PyObject* match;
3790 int status;
3791
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003792 state_reset(state);
3793
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003794 state->ptr = state->start;
3795
3796 if (state->charsize == 1) {
3797 status = sre_search(state, PatternObject_GetCode(self->pattern));
3798 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003799#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003800 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003801#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003802 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003803 if (PyErr_Occurred())
3804 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003805
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003806 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003807 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003808
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003809 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003810 state->start = (void*) ((char*) state->ptr + state->charsize);
3811 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003812 state->start = state->ptr;
3813
3814 return match;
3815}
3816
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003817static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003818 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3819 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003820 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003821};
3822
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003823#define SCAN_OFF(x) offsetof(ScannerObject, x)
3824static PyMemberDef scanner_members[] = {
3825 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3826 {NULL} /* Sentinel */
3827};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003828
Neal Norwitz57c179c2006-03-22 07:18:02 +00003829static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003830 PyVarObject_HEAD_INIT(NULL, 0)
3831 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003832 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003833 (destructor)scanner_dealloc,/* tp_dealloc */
3834 0, /* tp_print */
3835 0, /* tp_getattr */
3836 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003837 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003838 0, /* tp_repr */
3839 0, /* tp_as_number */
3840 0, /* tp_as_sequence */
3841 0, /* tp_as_mapping */
3842 0, /* tp_hash */
3843 0, /* tp_call */
3844 0, /* tp_str */
3845 0, /* tp_getattro */
3846 0, /* tp_setattro */
3847 0, /* tp_as_buffer */
3848 Py_TPFLAGS_DEFAULT, /* tp_flags */
3849 0, /* tp_doc */
3850 0, /* tp_traverse */
3851 0, /* tp_clear */
3852 0, /* tp_richcompare */
3853 0, /* tp_weaklistoffset */
3854 0, /* tp_iter */
3855 0, /* tp_iternext */
3856 scanner_methods, /* tp_methods */
3857 scanner_members, /* tp_members */
3858 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003859};
3860
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003861static PyObject*
3862pattern_scanner(PatternObject* pattern, PyObject* args)
3863{
3864 /* create search state object */
3865
3866 ScannerObject* self;
3867
3868 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003869 Py_ssize_t start = 0;
3870 Py_ssize_t end = PY_SSIZE_T_MAX;
3871 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003872 return NULL;
3873
3874 /* create scanner object */
3875 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3876 if (!self)
3877 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003878 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003879
3880 string = state_init(&self->state, pattern, string, start, end);
3881 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003882 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003883 return NULL;
3884 }
3885
3886 Py_INCREF(pattern);
3887 self->pattern = (PyObject*) pattern;
3888
3889 return (PyObject*) self;
3890}
3891
Guido van Rossumb700df92000-03-31 14:59:30 +00003892static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003893 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003894 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003895 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003896 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003897};
3898
Martin v. Löwis1a214512008-06-11 05:26:20 +00003899static struct PyModuleDef sremodule = {
3900 PyModuleDef_HEAD_INIT,
3901 "_" SRE_MODULE,
3902 NULL,
3903 -1,
3904 _functions,
3905 NULL,
3906 NULL,
3907 NULL,
3908 NULL
3909};
3910
3911PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003912{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003913 PyObject* m;
3914 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003915 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003916
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003917 /* Patch object types */
3918 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3919 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003920 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003921
Martin v. Löwis1a214512008-06-11 05:26:20 +00003922 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003923 if (m == NULL)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003924 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003925 d = PyModule_GetDict(m);
3926
Christian Heimes217cfd12007-12-02 14:31:20 +00003927 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003928 if (x) {
3929 PyDict_SetItemString(d, "MAGIC", x);
3930 Py_DECREF(x);
3931 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003932
Christian Heimes217cfd12007-12-02 14:31:20 +00003933 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003934 if (x) {
3935 PyDict_SetItemString(d, "CODESIZE", x);
3936 Py_DECREF(x);
3937 }
3938
Neal Norwitzfe537132007-08-26 03:55:15 +00003939 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003940 if (x) {
3941 PyDict_SetItemString(d, "copyright", x);
3942 Py_DECREF(x);
3943 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003944 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003945}
3946
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003947#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003948
3949/* vim:ts=4:sw=4:et
3950*/