blob: 2b764b11cadcef2bfdf98c4985f8d5356dad996d [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000070/* enables copy/deepcopy handling (work in progress) */
71#undef USE_BUILTIN_COPY
72
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000073#if PY_VERSION_HEX < 0x01060000
74#define PyObject_DEL(op) PyMem_DEL((op))
75#endif
76
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000077/* -------------------------------------------------------------------- */
78
Fredrik Lundh80946112000-06-29 18:03:25 +000079#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000080#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000081#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000082/* fastest possible local call under MSVC */
83#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000085#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000086#else
87#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000088#endif
89
90/* error codes */
91#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000092#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000093#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000094#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000095#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000096
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000097#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +000098#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000099#else
100#define TRACE(v)
101#endif
102
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000103/* -------------------------------------------------------------------- */
104/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000105
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000106/* default character predicates (run sre_chars.py to regenerate tables) */
107
108#define SRE_DIGIT_MASK 1
109#define SRE_SPACE_MASK 2
110#define SRE_LINEBREAK_MASK 4
111#define SRE_ALNUM_MASK 8
112#define SRE_WORD_MASK 16
113
Fredrik Lundh21009b92001-09-18 18:47:09 +0000114/* FIXME: this assumes ASCII. create tables in init_sre() instead */
115
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000116static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1180, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
11925, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12024, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1210, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12224, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
123
Fredrik Lundhb389df32000-06-29 12:48:37 +0000124static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d582000-06-29 08:58:44 +000012510, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
12627, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
12744, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
12861, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
129108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
130122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
131106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
132120, 121, 122, 123, 124, 125, 126, 127 };
133
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000134#define SRE_IS_DIGIT(ch)\
135 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
136#define SRE_IS_SPACE(ch)\
137 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
138#define SRE_IS_LINEBREAK(ch)\
139 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
140#define SRE_IS_ALNUM(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
142#define SRE_IS_WORD(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000144
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000145static unsigned int sre_lower(unsigned int ch)
146{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000147 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148}
149
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000150/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000151/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
152 * warnings when c's type supports only numbers < N+1 */
153#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
154#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000155#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000156#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000157#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
158
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000159static unsigned int sre_lower_locale(unsigned int ch)
160{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000161 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000162}
163
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000164/* unicode-specific character predicates */
165
166#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000167
Mark Dickinson1f268282009-07-28 17:22:36 +0000168#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000169#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
170#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000171#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000172#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000173
174static unsigned int sre_lower_unicode(unsigned int ch)
175{
176 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
177}
178
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000179#endif
180
Guido van Rossumb700df92000-03-31 14:59:30 +0000181LOCAL(int)
182sre_category(SRE_CODE category, unsigned int ch)
183{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000184 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000185
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000186 case SRE_CATEGORY_DIGIT:
187 return SRE_IS_DIGIT(ch);
188 case SRE_CATEGORY_NOT_DIGIT:
189 return !SRE_IS_DIGIT(ch);
190 case SRE_CATEGORY_SPACE:
191 return SRE_IS_SPACE(ch);
192 case SRE_CATEGORY_NOT_SPACE:
193 return !SRE_IS_SPACE(ch);
194 case SRE_CATEGORY_WORD:
195 return SRE_IS_WORD(ch);
196 case SRE_CATEGORY_NOT_WORD:
197 return !SRE_IS_WORD(ch);
198 case SRE_CATEGORY_LINEBREAK:
199 return SRE_IS_LINEBREAK(ch);
200 case SRE_CATEGORY_NOT_LINEBREAK:
201 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000202
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000203 case SRE_CATEGORY_LOC_WORD:
204 return SRE_LOC_IS_WORD(ch);
205 case SRE_CATEGORY_LOC_NOT_WORD:
206 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000207
208#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000209 case SRE_CATEGORY_UNI_DIGIT:
210 return SRE_UNI_IS_DIGIT(ch);
211 case SRE_CATEGORY_UNI_NOT_DIGIT:
212 return !SRE_UNI_IS_DIGIT(ch);
213 case SRE_CATEGORY_UNI_SPACE:
214 return SRE_UNI_IS_SPACE(ch);
215 case SRE_CATEGORY_UNI_NOT_SPACE:
216 return !SRE_UNI_IS_SPACE(ch);
217 case SRE_CATEGORY_UNI_WORD:
218 return SRE_UNI_IS_WORD(ch);
219 case SRE_CATEGORY_UNI_NOT_WORD:
220 return !SRE_UNI_IS_WORD(ch);
221 case SRE_CATEGORY_UNI_LINEBREAK:
222 return SRE_UNI_IS_LINEBREAK(ch);
223 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
224 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000225#else
226 case SRE_CATEGORY_UNI_DIGIT:
227 return SRE_IS_DIGIT(ch);
228 case SRE_CATEGORY_UNI_NOT_DIGIT:
229 return !SRE_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_SPACE:
231 return SRE_IS_SPACE(ch);
232 case SRE_CATEGORY_UNI_NOT_SPACE:
233 return !SRE_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_WORD:
235 return SRE_LOC_IS_WORD(ch);
236 case SRE_CATEGORY_UNI_NOT_WORD:
237 return !SRE_LOC_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_LINEBREAK:
239 return SRE_IS_LINEBREAK(ch);
240 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
241 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000242#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000243 }
244 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000245}
246
247/* helpers */
248
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000249static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000250data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000251{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000253 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000254 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000255 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000256 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000257}
258
259static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000260data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000261{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000262 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 minsize = state->data_stack_base+size;
264 cursize = state->data_stack_size;
265 if (cursize < minsize) {
266 void* stack;
267 cursize = minsize+minsize/4+1024;
268 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000270 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000271 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272 return SRE_ERROR_MEMORY;
273 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000274 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000276 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000277 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000278}
279
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000280/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000281
282#define SRE_CHAR unsigned char
283#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000284#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000285#define SRE_CHARSET sre_charset
286#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000287#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000288#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000289#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000290#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000291
292#if defined(HAVE_UNICODE)
293
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000295#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000297
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000298#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000299#undef SRE_SEARCH
300#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000301#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000302#undef SRE_INFO
303#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000304#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000305#undef SRE_AT
306#undef SRE_CHAR
307
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000308/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000309
310#define SRE_CHAR Py_UNICODE
311#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000312#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000313#define SRE_CHARSET sre_ucharset
314#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000315#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000316#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000317#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000318#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000319#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000320
321#endif /* SRE_RECURSIVE */
322
323/* -------------------------------------------------------------------- */
324/* String matching engine */
325
326/* the following section is compiled twice, with different character
327 settings */
328
329LOCAL(int)
330SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
331{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000332 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000333
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000334 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000336 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000338 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000339 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000340 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000341
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000342 case SRE_AT_BEGINNING_LINE:
343 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000344 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000345
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000346 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000347 return (((void*) (ptr+1) == state->end &&
348 SRE_IS_LINEBREAK((int) ptr[0])) ||
349 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000350
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000351 case SRE_AT_END_LINE:
352 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000353 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000354
Fredrik Lundh770617b2001-01-14 15:06:11 +0000355 case SRE_AT_END_STRING:
356 return ((void*) ptr == state->end);
357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000358 case SRE_AT_BOUNDARY:
359 if (state->beginning == state->end)
360 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000362 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000364 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000366
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000367 case SRE_AT_NON_BOUNDARY:
368 if (state->beginning == state->end)
369 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 SRE_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000373 SRE_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000375
376 case SRE_AT_LOC_BOUNDARY:
377 if (state->beginning == state->end)
378 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000380 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000384
385 case SRE_AT_LOC_NON_BOUNDARY:
386 if (state->beginning == state->end)
387 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000389 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000390 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000392 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000393
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000394#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000395 case SRE_AT_UNI_BOUNDARY:
396 if (state->beginning == state->end)
397 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000398 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000399 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000400 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000401 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000402 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000403
404 case SRE_AT_UNI_NON_BOUNDARY:
405 if (state->beginning == state->end)
406 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000407 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000408 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000409 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000411 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000412#endif
413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000414 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000415
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000416 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000417}
418
419LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000420SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000421{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000423
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000424 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000425
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000426 for (;;) {
427 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000428
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000429 case SRE_OP_FAILURE:
430 return !ok;
431
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000432 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000433 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000434 if (ch == set[0])
435 return ok;
436 set++;
437 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000438
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000439 case SRE_OP_CATEGORY:
440 /* <CATEGORY> <code> */
441 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000442 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000443 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Fredrik Lundh3562f112000-07-02 12:00:07 +0000446 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000447 if (sizeof(SRE_CODE) == 2) {
448 /* <CHARSET> <bitmap> (16 bits per code word) */
449 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
450 return ok;
451 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000452 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000453 else {
454 /* <CHARSET> <bitmap> (32 bits per code word) */
455 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
456 return ok;
457 set += 8;
458 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000459 break;
460
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000461 case SRE_OP_RANGE:
462 /* <RANGE> <lower> <upper> */
463 if (set[0] <= ch && ch <= set[1])
464 return ok;
465 set += 2;
466 break;
467
468 case SRE_OP_NEGATE:
469 ok = !ok;
470 break;
471
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000472 case SRE_OP_BIGCHARSET:
473 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
474 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000475 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000476 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000477
478 if (sizeof(SRE_CODE) == 2) {
479 block = ((unsigned char*)set)[ch >> 8];
480 set += 128;
481 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
482 return ok;
483 set += count*16;
484 }
485 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000486 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
487 * warnings when c's type supports only numbers < N+1 */
488 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000489 block = ((unsigned char*)set)[ch >> 8];
490 else
491 block = -1;
492 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000493 if (block >=0 &&
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000494 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
495 return ok;
496 set += count*8;
497 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000498 break;
499 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000500
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000501 default:
502 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000503 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 return 0;
505 }
506 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000507}
508
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000509LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000510
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000511LOCAL(Py_ssize_t)
512SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513{
514 SRE_CODE chr;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000515 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
516 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000517 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000518
519 /* adjust end */
520 if (maxcount < end - ptr && maxcount != 65535)
521 end = ptr + maxcount;
522
523 switch (pattern[0]) {
524
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000525 case SRE_OP_IN:
526 /* repeated set */
527 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
528 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
529 ptr++;
530 break;
531
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000532 case SRE_OP_ANY:
533 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000534 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000535 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
536 ptr++;
537 break;
538
539 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000540 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000541 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000542 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000543 ptr = end;
544 break;
545
546 case SRE_OP_LITERAL:
547 /* repeated literal */
548 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 while (ptr < end && (SRE_CODE) *ptr == chr)
551 ptr++;
552 break;
553
554 case SRE_OP_LITERAL_IGNORE:
555 /* repeated literal */
556 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000557 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000558 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
559 ptr++;
560 break;
561
562 case SRE_OP_NOT_LITERAL:
563 /* repeated non-literal */
564 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000565 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000566 while (ptr < end && (SRE_CODE) *ptr != chr)
567 ptr++;
568 break;
Tim Peters3d563502006-01-21 02:47:53 +0000569
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000570 case SRE_OP_NOT_LITERAL_IGNORE:
571 /* repeated non-literal */
572 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000573 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000574 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
575 ptr++;
576 break;
577
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000578 default:
579 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000580 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000582 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000583 if (i < 0)
584 return i;
585 if (!i)
586 break;
587 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000588 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
589 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 return (SRE_CHAR*) state->ptr - ptr;
591 }
592
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000593 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000594 return ptr - (SRE_CHAR*) state->ptr;
595}
596
Fredrik Lundh33accc12000-08-27 20:59:47 +0000597#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000598LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000599SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
600{
601 /* check if an SRE_OP_INFO block matches at the current position.
602 returns the number of SRE_CODE objects to skip if successful, 0
603 if no match */
604
605 SRE_CHAR* end = state->end;
606 SRE_CHAR* ptr = state->ptr;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000607 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000608
609 /* check minimal length */
610 if (pattern[3] && (end - ptr) < pattern[3])
611 return 0;
612
613 /* check known prefix */
614 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
615 /* <length> <skip> <prefix data> <overlap data> */
616 for (i = 0; i < pattern[5]; i++)
617 if ((SRE_CODE) ptr[i] != pattern[7 + i])
618 return 0;
619 return pattern[0] + 2 * pattern[6];
620 }
621 return pattern[0];
622}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000623#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000624
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000625/* The macros below should be used to protect recursive SRE_MATCH()
626 * calls that *failed* and do *not* return immediately (IOW, those
627 * that will backtrack). Explaining:
628 *
629 * - Recursive SRE_MATCH() returned true: that's usually a success
630 * (besides atypical cases like ASSERT_NOT), therefore there's no
631 * reason to restore lastmark;
632 *
633 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
634 * is returning to the caller: If the current SRE_MATCH() is the
635 * top function of the recursion, returning false will be a matching
636 * failure, and it doesn't matter where lastmark is pointing to.
637 * If it's *not* the top function, it will be a recursive SRE_MATCH()
638 * failure by itself, and the calling SRE_MATCH() will have to deal
639 * with the failure by the same rules explained here (it will restore
640 * lastmark by itself if necessary);
641 *
642 * - Recursive SRE_MATCH() returned false, and will continue the
643 * outside 'for' loop: must be protected when breaking, since the next
644 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000645 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000646 * - Recursive SRE_MATCH() returned false, and will be called again
647 * inside a local for/while loop: must be protected between each
648 * loop iteration, since the recursive SRE_MATCH() could do anything,
649 * and could potentially depend on lastmark.
650 *
651 * For more information, check the discussion at SF patch #712900.
652 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000653#define LASTMARK_SAVE() \
654 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000655 ctx->lastmark = state->lastmark; \
656 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000657 } while (0)
658#define LASTMARK_RESTORE() \
659 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000660 state->lastmark = ctx->lastmark; \
661 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000662 } while (0)
663
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000664#define RETURN_ERROR(i) do { return i; } while(0)
665#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
666#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
667
668#define RETURN_ON_ERROR(i) \
669 do { if (i < 0) RETURN_ERROR(i); } while (0)
670#define RETURN_ON_SUCCESS(i) \
671 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
672#define RETURN_ON_FAILURE(i) \
673 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
674
675#define SFY(x) #x
676
677#define DATA_STACK_ALLOC(state, type, ptr) \
678do { \
679 alloc_pos = state->data_stack_base; \
680 TRACE(("allocating %s in %d (%d)\n", \
681 SFY(type), alloc_pos, sizeof(type))); \
682 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
683 int j = data_stack_grow(state, sizeof(type)); \
684 if (j < 0) return j; \
685 if (ctx_pos != -1) \
686 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
687 } \
688 ptr = (type*)(state->data_stack+alloc_pos); \
689 state->data_stack_base += sizeof(type); \
690} while (0)
691
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000692#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
693do { \
694 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
695 ptr = (type*)(state->data_stack+pos); \
696} while (0)
697
698#define DATA_STACK_PUSH(state, data, size) \
699do { \
700 TRACE(("copy data in %p to %d (%d)\n", \
701 data, state->data_stack_base, size)); \
702 if (state->data_stack_size < state->data_stack_base+size) { \
703 int j = data_stack_grow(state, size); \
704 if (j < 0) return j; \
705 if (ctx_pos != -1) \
706 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
707 } \
708 memcpy(state->data_stack+state->data_stack_base, data, size); \
709 state->data_stack_base += size; \
710} while (0)
711
712#define DATA_STACK_POP(state, data, size, discard) \
713do { \
714 TRACE(("copy data to %p from %d (%d)\n", \
715 data, state->data_stack_base-size, size)); \
716 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
717 if (discard) \
718 state->data_stack_base -= size; \
719} while (0)
720
721#define DATA_STACK_POP_DISCARD(state, size) \
722do { \
723 TRACE(("discard data from %d (%d)\n", \
724 state->data_stack_base-size, size)); \
725 state->data_stack_base -= size; \
726} while(0)
727
728#define DATA_PUSH(x) \
729 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
730#define DATA_POP(x) \
731 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000732#define DATA_POP_DISCARD(x) \
733 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
734#define DATA_ALLOC(t,p) \
735 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000736#define DATA_LOOKUP_AT(t,p,pos) \
737 DATA_STACK_LOOKUP_AT(state,t,p,pos)
738
739#define MARK_PUSH(lastmark) \
740 do if (lastmark > 0) { \
741 i = lastmark; /* ctx->lastmark may change if reallocated */ \
742 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
743 } while (0)
744#define MARK_POP(lastmark) \
745 do if (lastmark > 0) { \
746 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
747 } while (0)
748#define MARK_POP_KEEP(lastmark) \
749 do if (lastmark > 0) { \
750 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
751 } while (0)
752#define MARK_POP_DISCARD(lastmark) \
753 do if (lastmark > 0) { \
754 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
755 } while (0)
756
757#define JUMP_NONE 0
758#define JUMP_MAX_UNTIL_1 1
759#define JUMP_MAX_UNTIL_2 2
760#define JUMP_MAX_UNTIL_3 3
761#define JUMP_MIN_UNTIL_1 4
762#define JUMP_MIN_UNTIL_2 5
763#define JUMP_MIN_UNTIL_3 6
764#define JUMP_REPEAT 7
765#define JUMP_REPEAT_ONE_1 8
766#define JUMP_REPEAT_ONE_2 9
767#define JUMP_MIN_REPEAT_ONE 10
768#define JUMP_BRANCH 11
769#define JUMP_ASSERT 12
770#define JUMP_ASSERT_NOT 13
771
772#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
773 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
774 nextctx->last_ctx_pos = ctx_pos; \
775 nextctx->jump = jumpvalue; \
776 nextctx->pattern = nextpattern; \
777 ctx_pos = alloc_pos; \
778 ctx = nextctx; \
779 goto entrance; \
780 jumplabel: \
781 while (0) /* gcc doesn't like labels at end of scopes */ \
782
783typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000784 Py_ssize_t last_ctx_pos;
785 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000786 SRE_CHAR* ptr;
787 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000788 Py_ssize_t count;
789 Py_ssize_t lastmark;
790 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000791 union {
792 SRE_CODE chr;
793 SRE_REPEAT* rep;
794 } u;
795} SRE_MATCH_CONTEXT;
796
797/* check if string matches the given pattern. returns <0 for
798 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000799LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000800SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000801{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000802 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000803 Py_ssize_t alloc_pos, ctx_pos = -1;
804 Py_ssize_t i, ret = 0;
805 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000806 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000807
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000808 SRE_MATCH_CONTEXT* ctx;
809 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000810
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000811 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000813 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
814 ctx->last_ctx_pos = -1;
815 ctx->jump = JUMP_NONE;
816 ctx->pattern = pattern;
817 ctx_pos = alloc_pos;
818
819entrance:
820
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000821 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000822
823 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000824 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000825 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000826 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000827 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000828 (end - ctx->ptr), ctx->pattern[3]));
829 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000830 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000832 }
833
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000834 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000835 ++sigcount;
836 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
837 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000838
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000839 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000840
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000841 case SRE_OP_MARK:
842 /* set mark */
843 /* <MARK> <gid> */
844 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
845 ctx->ptr, ctx->pattern[0]));
846 i = ctx->pattern[0];
847 if (i & 1)
848 state->lastindex = i/2 + 1;
849 if (i > state->lastmark) {
850 /* state->lastmark is the highest valid index in the
851 state->mark array. If it is increased by more than 1,
852 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000853 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000854 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000855 while (j < i)
856 state->mark[j++] = NULL;
857 state->lastmark = i;
858 }
859 state->mark[i] = ctx->ptr;
860 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000861 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000862
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 case SRE_OP_LITERAL:
864 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000865 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000866 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
867 ctx->ptr, *ctx->pattern));
868 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
869 RETURN_FAILURE;
870 ctx->pattern++;
871 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000872 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000873
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 case SRE_OP_NOT_LITERAL:
875 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000876 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000877 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
878 ctx->ptr, *ctx->pattern));
879 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
880 RETURN_FAILURE;
881 ctx->pattern++;
882 ctx->ptr++;
883 break;
884
885 case SRE_OP_SUCCESS:
886 /* end of pattern */
887 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
888 state->ptr = ctx->ptr;
889 RETURN_SUCCESS;
890
891 case SRE_OP_AT:
892 /* match at given position */
893 /* <AT> <code> */
894 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
895 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
896 RETURN_FAILURE;
897 ctx->pattern++;
898 break;
899
900 case SRE_OP_CATEGORY:
901 /* match at given category */
902 /* <CATEGORY> <code> */
903 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
904 ctx->ptr, *ctx->pattern));
905 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
906 RETURN_FAILURE;
907 ctx->pattern++;
908 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000909 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000910
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000911 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000912 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000913 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
915 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
916 RETURN_FAILURE;
917 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000918 break;
919
920 case SRE_OP_ANY_ALL:
921 /* match anything */
922 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000923 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
924 if (ctx->ptr >= end)
925 RETURN_FAILURE;
926 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000927 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000929 case SRE_OP_IN:
930 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000931 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000932 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
933 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
934 RETURN_FAILURE;
935 ctx->pattern += ctx->pattern[0];
936 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000937 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000939 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000940 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
941 ctx->pattern, ctx->ptr, ctx->pattern[0]));
942 if (ctx->ptr >= end ||
943 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
944 RETURN_FAILURE;
945 ctx->pattern++;
946 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000947 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000948
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000949 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000950 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
951 ctx->pattern, ctx->ptr, *ctx->pattern));
952 if (ctx->ptr >= end ||
953 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
954 RETURN_FAILURE;
955 ctx->pattern++;
956 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000957 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000958
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000959 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000960 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
961 if (ctx->ptr >= end
962 || !SRE_CHARSET(ctx->pattern+1,
963 (SRE_CODE)state->lower(*ctx->ptr)))
964 RETURN_FAILURE;
965 ctx->pattern += ctx->pattern[0];
966 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000967 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000968
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000969 case SRE_OP_JUMP:
970 case SRE_OP_INFO:
971 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000972 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
974 ctx->ptr, ctx->pattern[0]));
975 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 break;
977
978 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* alternation */
980 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000981 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000982 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 ctx->u.rep = state->repeat;
984 if (ctx->u.rep)
985 MARK_PUSH(ctx->lastmark);
986 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
987 if (ctx->pattern[1] == SRE_OP_LITERAL &&
988 (ctx->ptr >= end ||
989 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000990 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000991 if (ctx->pattern[1] == SRE_OP_IN &&
992 (ctx->ptr >= end ||
993 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000994 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000995 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000996 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 if (ret) {
998 if (ctx->u.rep)
999 MARK_POP_DISCARD(ctx->lastmark);
1000 RETURN_ON_ERROR(ret);
1001 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001002 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 if (ctx->u.rep)
1004 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001005 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001006 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001007 if (ctx->u.rep)
1008 MARK_POP_DISCARD(ctx->lastmark);
1009 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001010
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001011 case SRE_OP_REPEAT_ONE:
1012 /* match repeated sequence (maximizing regexp) */
1013
1014 /* this operator only works if the repeated item is
1015 exactly one character wide, and we're not already
1016 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001017 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
1019 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1020
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001021 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1022 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001023
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001024 if (ctx->ptr + ctx->pattern[1] > end)
1025 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001029 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1030 RETURN_ON_ERROR(ret);
1031 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1032 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001033 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001034
1035 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 string. check if the rest of the pattern matches,
1038 and backtrack if not. */
1039
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001040 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001042
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001045 state->ptr = ctx->ptr;
1046 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001047 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001048
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001049 LASTMARK_SAVE();
1050
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 /* tail starts with a literal. skip positions where
1053 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001056 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1058 ctx->ptr--;
1059 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001060 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001061 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1065 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001066 if (ret) {
1067 RETURN_ON_ERROR(ret);
1068 RETURN_SUCCESS;
1069 }
Tim Peters3d563502006-01-21 02:47:53 +00001070
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001071 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001072
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 ctx->ptr--;
1074 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001075 }
1076
1077 } else {
1078 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001079 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1082 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001083 if (ret) {
1084 RETURN_ON_ERROR(ret);
1085 RETURN_SUCCESS;
1086 }
1087 ctx->ptr--;
1088 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001089 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001090 }
1091 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001092 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001093
Guido van Rossum41c99e72003-04-14 17:59:34 +00001094 case SRE_OP_MIN_REPEAT_ONE:
1095 /* match repeated sequence (minimizing regexp) */
1096
1097 /* this operator only works if the repeated item is
1098 exactly one character wide, and we're not already
1099 collecting backtracking points. for other cases,
1100 use the MIN_REPEAT operator */
1101
1102 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1105 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001106
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001107 if (ctx->ptr + ctx->pattern[1] > end)
1108 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001109
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001110 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001111
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001112 if (ctx->pattern[1] == 0)
1113 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001114 else {
1115 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001116 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1117 RETURN_ON_ERROR(ret);
1118 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001119 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001120 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 RETURN_FAILURE;
1122 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001125 }
1126
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001128 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131
1132 } else {
1133 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001134 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001135 while ((Py_ssize_t)ctx->pattern[2] == 65535
1136 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001137 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1143 }
1144 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001146 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001147 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001149 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001150 assert(ret == 1);
1151 ctx->ptr++;
1152 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001153 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001154 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001155 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001156 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001158 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001159 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001160 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001161 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001162 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1163 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
1165 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001166 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001167 if (!ctx->u.rep) {
1168 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001169 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001170 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001171 ctx->u.rep->count = -1;
1172 ctx->u.rep->pattern = ctx->pattern;
1173 ctx->u.rep->prev = state->repeat;
1174 ctx->u.rep->last_ptr = NULL;
1175 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001176
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001177 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001179 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001180 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001181
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001182 if (ret) {
1183 RETURN_ON_ERROR(ret);
1184 RETURN_SUCCESS;
1185 }
1186 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001187
1188 case SRE_OP_MAX_UNTIL:
1189 /* maximizing repeat */
1190 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1191
1192 /* FIXME: we probably need to deal with zero-width
1193 matches in here... */
1194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 ctx->u.rep = state->repeat;
1196 if (!ctx->u.rep)
1197 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001198
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001201 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001202
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001203 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1204 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001209 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1210 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001211 if (ret) {
1212 RETURN_ON_ERROR(ret);
1213 RETURN_SUCCESS;
1214 }
1215 ctx->u.rep->count = ctx->count-1;
1216 state->ptr = ctx->ptr;
1217 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001218 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001219
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001220 if ((ctx->count < ctx->u.rep->pattern[2] ||
1221 ctx->u.rep->pattern[2] == 65535) &&
1222 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001223 /* we may have enough matches, but if we can
1224 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001226 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 MARK_PUSH(ctx->lastmark);
1228 /* zero-width match protection */
1229 DATA_PUSH(&ctx->u.rep->last_ptr);
1230 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001231 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1232 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001233 DATA_POP(&ctx->u.rep->last_ptr);
1234 if (ret) {
1235 MARK_POP_DISCARD(ctx->lastmark);
1236 RETURN_ON_ERROR(ret);
1237 RETURN_SUCCESS;
1238 }
1239 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001240 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001241 ctx->u.rep->count = ctx->count-1;
1242 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001243 }
1244
1245 /* cannot match more repeated items here. make sure the
1246 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001247 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001249 RETURN_ON_SUCCESS(ret);
1250 state->repeat = ctx->u.rep;
1251 state->ptr = ctx->ptr;
1252 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001253
1254 case SRE_OP_MIN_UNTIL:
1255 /* minimizing repeat */
1256 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1257
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001258 ctx->u.rep = state->repeat;
1259 if (!ctx->u.rep)
1260 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001261
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001263
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001265
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001266 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1267 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001270 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001272 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1273 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001274 if (ret) {
1275 RETURN_ON_ERROR(ret);
1276 RETURN_SUCCESS;
1277 }
1278 ctx->u.rep->count = ctx->count-1;
1279 state->ptr = ctx->ptr;
1280 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001281 }
1282
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001283 LASTMARK_SAVE();
1284
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001285 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001286 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001288 if (ret) {
1289 RETURN_ON_ERROR(ret);
1290 RETURN_SUCCESS;
1291 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001292
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 state->repeat = ctx->u.rep;
1294 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001295
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001296 LASTMARK_RESTORE();
1297
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001298 if (ctx->count >= ctx->u.rep->pattern[2]
1299 && ctx->u.rep->pattern[2] != 65535)
1300 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001301
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001302 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001303 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1304 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 if (ret) {
1306 RETURN_ON_ERROR(ret);
1307 RETURN_SUCCESS;
1308 }
1309 ctx->u.rep->count = ctx->count-1;
1310 state->ptr = ctx->ptr;
1311 RETURN_FAILURE;
1312
1313 case SRE_OP_GROUPREF:
1314 /* match backreference */
1315 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1316 ctx->ptr, ctx->pattern[0]));
1317 i = ctx->pattern[0];
1318 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001319 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001320 if (groupref >= state->lastmark) {
1321 RETURN_FAILURE;
1322 } else {
1323 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1324 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1325 if (!p || !e || e < p)
1326 RETURN_FAILURE;
1327 while (p < e) {
1328 if (ctx->ptr >= end || *ctx->ptr != *p)
1329 RETURN_FAILURE;
1330 p++; ctx->ptr++;
1331 }
1332 }
1333 }
1334 ctx->pattern++;
1335 break;
1336
1337 case SRE_OP_GROUPREF_IGNORE:
1338 /* match backreference */
1339 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1340 ctx->ptr, ctx->pattern[0]));
1341 i = ctx->pattern[0];
1342 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001343 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001344 if (groupref >= state->lastmark) {
1345 RETURN_FAILURE;
1346 } else {
1347 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1348 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1349 if (!p || !e || e < p)
1350 RETURN_FAILURE;
1351 while (p < e) {
1352 if (ctx->ptr >= end ||
1353 state->lower(*ctx->ptr) != state->lower(*p))
1354 RETURN_FAILURE;
1355 p++; ctx->ptr++;
1356 }
1357 }
1358 }
1359 ctx->pattern++;
1360 break;
1361
1362 case SRE_OP_GROUPREF_EXISTS:
1363 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1364 ctx->ptr, ctx->pattern[0]));
1365 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1366 i = ctx->pattern[0];
1367 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001368 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001369 if (groupref >= state->lastmark) {
1370 ctx->pattern += ctx->pattern[1];
1371 break;
1372 } else {
1373 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1374 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1375 if (!p || !e || e < p) {
1376 ctx->pattern += ctx->pattern[1];
1377 break;
1378 }
1379 }
1380 }
1381 ctx->pattern += 2;
1382 break;
1383
1384 case SRE_OP_ASSERT:
1385 /* assert subpattern */
1386 /* <ASSERT> <skip> <back> <pattern> */
1387 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1388 ctx->ptr, ctx->pattern[1]));
1389 state->ptr = ctx->ptr - ctx->pattern[1];
1390 if (state->ptr < state->beginning)
1391 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001392 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001393 RETURN_ON_FAILURE(ret);
1394 ctx->pattern += ctx->pattern[0];
1395 break;
1396
1397 case SRE_OP_ASSERT_NOT:
1398 /* assert not subpattern */
1399 /* <ASSERT_NOT> <skip> <back> <pattern> */
1400 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1401 ctx->ptr, ctx->pattern[1]));
1402 state->ptr = ctx->ptr - ctx->pattern[1];
1403 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001404 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001405 if (ret) {
1406 RETURN_ON_ERROR(ret);
1407 RETURN_FAILURE;
1408 }
1409 }
1410 ctx->pattern += ctx->pattern[0];
1411 break;
1412
1413 case SRE_OP_FAILURE:
1414 /* immediate failure */
1415 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1416 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001417
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001418 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001419 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1420 ctx->pattern[-1]));
1421 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001422 }
1423 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001424
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001425exit:
1426 ctx_pos = ctx->last_ctx_pos;
1427 jump = ctx->jump;
1428 DATA_POP_DISCARD(ctx);
1429 if (ctx_pos == -1)
1430 return ret;
1431 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1432
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001433 switch (jump) {
1434 case JUMP_MAX_UNTIL_2:
1435 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1436 goto jump_max_until_2;
1437 case JUMP_MAX_UNTIL_3:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_3;
1440 case JUMP_MIN_UNTIL_2:
1441 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1442 goto jump_min_until_2;
1443 case JUMP_MIN_UNTIL_3:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_3;
1446 case JUMP_BRANCH:
1447 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1448 goto jump_branch;
1449 case JUMP_MAX_UNTIL_1:
1450 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1451 goto jump_max_until_1;
1452 case JUMP_MIN_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_min_until_1;
1455 case JUMP_REPEAT:
1456 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1457 goto jump_repeat;
1458 case JUMP_REPEAT_ONE_1:
1459 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat_one_1;
1461 case JUMP_REPEAT_ONE_2:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_2;
1464 case JUMP_MIN_REPEAT_ONE:
1465 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1466 goto jump_min_repeat_one;
1467 case JUMP_ASSERT:
1468 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1469 goto jump_assert;
1470 case JUMP_ASSERT_NOT:
1471 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert_not;
1473 case JUMP_NONE:
1474 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1475 break;
1476 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001477
1478 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001479}
1480
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001481LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001482SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1483{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001484 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1485 SRE_CHAR* end = (SRE_CHAR *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001486 Py_ssize_t status = 0;
1487 Py_ssize_t prefix_len = 0;
1488 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001489 SRE_CODE* prefix = NULL;
1490 SRE_CODE* charset = NULL;
1491 SRE_CODE* overlap = NULL;
1492 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001493
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001494 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001495 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001496 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001497
1498 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001499
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001500 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001501 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001502 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001503 end -= pattern[3]-1;
1504 if (end <= ptr)
1505 end = ptr+1;
1506 }
1507
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001509 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001510 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001511 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001512 prefix_skip = pattern[6];
1513 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 overlap = prefix + prefix_len - 1;
1515 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001516 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001517 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001518 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001519
1520 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001521 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001522
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001523 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1524 TRACE(("charset = %p\n", charset));
1525
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001527 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001528 /* pattern starts with a known prefix. use the overlap
1529 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001530 Py_ssize_t i = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001531 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001532 while (ptr < end) {
1533 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001534 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 if (!i)
1536 break;
1537 else
1538 i = overlap[i];
1539 } else {
1540 if (++i == prefix_len) {
1541 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001542 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1543 state->start = ptr + 1 - prefix_len;
1544 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001545 if (flags & SRE_INFO_LITERAL)
1546 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001547 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001548 if (status != 0)
1549 return status;
1550 /* close but no cigar -- try again */
1551 i = overlap[i];
1552 }
1553 break;
1554 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001555 }
1556 ptr++;
1557 }
1558 return 0;
1559 }
1560#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001561
Fredrik Lundh3562f112000-07-02 12:00:07 +00001562 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001564 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 SRE_CODE chr = pattern[1];
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001566 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001567 for (;;) {
1568 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1569 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001570 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001571 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001572 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001573 state->start = ptr;
1574 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001575 if (flags & SRE_INFO_LITERAL)
1576 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001577 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 if (status != 0)
1579 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001580 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001581 } else if (charset) {
1582 /* pattern starts with a character from a known set */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001583 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001584 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001585 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001587 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001589 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001590 state->start = ptr;
1591 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001592 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 if (status != 0)
1594 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001595 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 }
1597 } else
1598 /* general case */
1599 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001600 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001602 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 if (status != 0)
1604 break;
1605 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001606
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001607 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001608}
Tim Peters3d563502006-01-21 02:47:53 +00001609
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001610LOCAL(int)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001611SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001612{
1613 /* check if given string is a literal template (i.e. no escapes) */
1614 while (len-- > 0)
1615 if (*ptr++ == '\\')
1616 return 0;
1617 return 1;
1618}
Guido van Rossumb700df92000-03-31 14:59:30 +00001619
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001620#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001621
1622/* -------------------------------------------------------------------- */
1623/* factories and destructors */
1624
1625/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001626static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1627static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001628
1629static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001630sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001631{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001632 return Py_BuildValue("l", sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001633}
1634
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001635static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001636sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001637{
1638 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001639 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001640 return NULL;
1641 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001642 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001643 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001644#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001645 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001646#else
1647 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001648#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001649 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001650}
1651
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001652LOCAL(void)
1653state_reset(SRE_STATE* state)
1654{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001656 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001657
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001658 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001659 state->lastindex = -1;
1660
1661 state->repeat = NULL;
1662
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001663 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664}
1665
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001666static void*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001667getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001668{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001669 /* given a python object, return a data pointer, a length (in
1670 characters), and a character size. return NULL if the object
1671 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001672
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001673 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001674 Py_ssize_t size, bytes;
1675 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001676 void* ptr;
Travis E. Oliphant8ae62b62007-09-23 02:00:13 +00001677 Py_buffer view;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001678
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001679 /* Unicode objects do not support the buffer API. So, get the data
1680 directly instead. */
1681 if (PyUnicode_Check(string)) {
1682 ptr = (void *)PyUnicode_AS_DATA(string);
1683 *p_length = PyUnicode_GET_SIZE(string);
1684 *p_charsize = sizeof(Py_UNICODE);
1685 return ptr;
1686 }
1687
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001688 /* get pointer to string buffer */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001689 view.len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001690 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001691 if (!buffer || !buffer->bf_getbuffer ||
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001692 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1693 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1694 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001695 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001696
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 /* determine buffer size */
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001698 bytes = view.len;
1699 ptr = view.buf;
1700
1701 /* Release the buffer immediately --- possibly dangerous
1702 but doing something else would require some re-factoring
1703 */
Martin v. Löwis423be952008-08-13 15:53:07 +00001704 PyBuffer_Release(&view);
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001705
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001706 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001707 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1708 return NULL;
1709 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001710
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001711 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001712 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001713
Christian Heimes72b710a2008-05-26 13:28:38 +00001714 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001715 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001716#if defined(HAVE_UNICODE)
Antoine Pitroufd036452008-08-19 17:56:33 +00001717 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001718 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001719#endif
1720 else {
1721 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1722 return NULL;
1723 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001724
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001725 *p_length = size;
1726 *p_charsize = charsize;
1727
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001728 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001729 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001730 "Buffer is NULL");
1731 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001732 return ptr;
1733}
1734
1735LOCAL(PyObject*)
1736state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001737 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001738{
1739 /* prepare state object */
1740
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001741 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001742 int charsize;
1743 void* ptr;
1744
1745 memset(state, 0, sizeof(SRE_STATE));
1746
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001747 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001748 state->lastindex = -1;
1749
1750 ptr = getstring(string, &length, &charsize);
1751 if (!ptr)
1752 return NULL;
1753
Antoine Pitroufd036452008-08-19 17:56:33 +00001754 if (charsize == 1 && pattern->charsize > 1) {
1755 PyErr_SetString(PyExc_TypeError,
1756 "can't use a string pattern on a bytes-like object");
1757 return NULL;
1758 }
1759 if (charsize > 1 && pattern->charsize == 1) {
1760 PyErr_SetString(PyExc_TypeError,
1761 "can't use a bytes pattern on a string-like object");
1762 return NULL;
1763 }
1764
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001765 /* adjust boundaries */
1766 if (start < 0)
1767 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001768 else if (start > length)
1769 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001771 if (end < 0)
1772 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001773 else if (end > length)
1774 end = length;
1775
1776 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001777
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001778 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001779
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001780 state->start = (void*) ((char*) ptr + start * state->charsize);
1781 state->end = (void*) ((char*) ptr + end * state->charsize);
1782
1783 Py_INCREF(string);
1784 state->string = string;
1785 state->pos = start;
1786 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001787
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001788 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001789 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001790 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001791#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001792 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001793#else
1794 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001795#endif
1796 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001797 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001798
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001799 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001800}
1801
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001802LOCAL(void)
1803state_fini(SRE_STATE* state)
1804{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001805 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001806 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001807}
1808
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001809/* calculate offset from start of string */
1810#define STATE_OFFSET(state, member)\
1811 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1812
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001813LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001814state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001815{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001816 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001817
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001818 index = (index - 1) * 2;
1819
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001820 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001821 if (empty)
1822 /* want empty string */
1823 i = j = 0;
1824 else {
1825 Py_INCREF(Py_None);
1826 return Py_None;
1827 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001828 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001829 i = STATE_OFFSET(state, state->mark[index]);
1830 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001831 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001832
Fredrik Lundh58100642000-08-09 09:14:35 +00001833 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001834}
1835
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001836static void
1837pattern_error(int status)
1838{
1839 switch (status) {
1840 case SRE_ERROR_RECURSION_LIMIT:
1841 PyErr_SetString(
1842 PyExc_RuntimeError,
1843 "maximum recursion limit exceeded"
1844 );
1845 break;
1846 case SRE_ERROR_MEMORY:
1847 PyErr_NoMemory();
1848 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001849 case SRE_ERROR_INTERRUPTED:
1850 /* An exception has already been raised, so let it fly */
1851 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001852 default:
1853 /* other error codes indicate compiler/engine bugs */
1854 PyErr_SetString(
1855 PyExc_RuntimeError,
1856 "internal error in regular expression engine"
1857 );
1858 }
1859}
1860
Guido van Rossumb700df92000-03-31 14:59:30 +00001861static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001862pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001863{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001864 if (self->weakreflist != NULL)
1865 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001866 Py_XDECREF(self->pattern);
1867 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001868 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001869 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001870}
1871
1872static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001873pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001874{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001875 SRE_STATE state;
1876 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001877
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001878 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001879 Py_ssize_t start = 0;
1880 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001881 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001882 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001883 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001884 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001885
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001886 string = state_init(&state, self, string, start, end);
1887 if (!string)
1888 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001889
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001890 state.ptr = state.start;
1891
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001892 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1893
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001894 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001895 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001896 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001897#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001898 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001899#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001900 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001901
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001902 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001903 if (PyErr_Occurred())
1904 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001909}
1910
1911static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001912pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001913{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001914 SRE_STATE state;
1915 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001916
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001917 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001918 Py_ssize_t start = 0;
1919 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001920 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001921 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001922 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001923 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001925 string = state_init(&state, self, string, start, end);
1926 if (!string)
1927 return NULL;
1928
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001929 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1930
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001931 if (state.charsize == 1) {
1932 status = sre_search(&state, PatternObject_GetCode(self));
1933 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001934#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001935 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001936#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001937 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001938
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001939 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1940
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001941 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001942
Thomas Wouters89f507f2006-12-13 04:49:30 +00001943 if (PyErr_Occurred())
1944 return NULL;
1945
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001946 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001947}
1948
1949static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001950call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001951{
1952 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001953 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954 PyObject* func;
1955 PyObject* result;
1956
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001957 if (!args)
1958 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001959 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001960 if (!name)
1961 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001962 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001964 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001965 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001966 func = PyObject_GetAttrString(mod, function);
1967 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001968 if (!func)
1969 return NULL;
1970 result = PyObject_CallObject(func, args);
1971 Py_DECREF(func);
1972 Py_DECREF(args);
1973 return result;
1974}
1975
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001976#ifdef USE_BUILTIN_COPY
1977static int
1978deepcopy(PyObject** object, PyObject* memo)
1979{
1980 PyObject* copy;
1981
1982 copy = call(
1983 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001984 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001985 );
1986 if (!copy)
1987 return 0;
1988
1989 Py_DECREF(*object);
1990 *object = copy;
1991
1992 return 1; /* success */
1993}
1994#endif
1995
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001996static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001997join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001998{
1999 /* join list elements */
2000
2001 PyObject* joiner;
2002#if PY_VERSION_HEX >= 0x01060000
2003 PyObject* function;
2004 PyObject* args;
2005#endif
2006 PyObject* result;
2007
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002008 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002009 if (!joiner)
2010 return NULL;
2011
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002012 if (PyList_GET_SIZE(list) == 0) {
2013 Py_DECREF(list);
2014 return joiner;
2015 }
2016
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002017#if PY_VERSION_HEX >= 0x01060000
2018 function = PyObject_GetAttrString(joiner, "join");
2019 if (!function) {
2020 Py_DECREF(joiner);
2021 return NULL;
2022 }
2023 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002024 if (!args) {
2025 Py_DECREF(function);
2026 Py_DECREF(joiner);
2027 return NULL;
2028 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002029 PyTuple_SET_ITEM(args, 0, list);
2030 result = PyObject_CallObject(function, args);
2031 Py_DECREF(args); /* also removes list */
2032 Py_DECREF(function);
2033#else
2034 result = call(
2035 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002036 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002037 );
2038#endif
2039 Py_DECREF(joiner);
2040
2041 return result;
2042}
2043
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002044static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002045pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002046{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002047 SRE_STATE state;
2048 PyObject* list;
2049 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002050 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002051
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002053 Py_ssize_t start = 0;
2054 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002055 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002056 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002057 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002058 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002059
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 string = state_init(&state, self, string, start, end);
2061 if (!string)
2062 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002064 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002065 if (!list) {
2066 state_fini(&state);
2067 return NULL;
2068 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002069
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002070 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002071
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002073
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002074 state_reset(&state);
2075
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002076 state.ptr = state.start;
2077
2078 if (state.charsize == 1) {
2079 status = sre_search(&state, PatternObject_GetCode(self));
2080 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002081#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002083#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002084 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002085
Thomas Wouters89f507f2006-12-13 04:49:30 +00002086 if (PyErr_Occurred())
2087 goto error;
2088
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002089 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002090 if (status == 0)
2091 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002092 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002093 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002094 }
Tim Peters3d563502006-01-21 02:47:53 +00002095
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002096 /* don't bother to build a match object */
2097 switch (self->groups) {
2098 case 0:
2099 b = STATE_OFFSET(&state, state.start);
2100 e = STATE_OFFSET(&state, state.ptr);
2101 item = PySequence_GetSlice(string, b, e);
2102 if (!item)
2103 goto error;
2104 break;
2105 case 1:
2106 item = state_getslice(&state, 1, string, 1);
2107 if (!item)
2108 goto error;
2109 break;
2110 default:
2111 item = PyTuple_New(self->groups);
2112 if (!item)
2113 goto error;
2114 for (i = 0; i < self->groups; i++) {
2115 PyObject* o = state_getslice(&state, i+1, string, 1);
2116 if (!o) {
2117 Py_DECREF(item);
2118 goto error;
2119 }
2120 PyTuple_SET_ITEM(item, i, o);
2121 }
2122 break;
2123 }
2124
2125 status = PyList_Append(list, item);
2126 Py_DECREF(item);
2127 if (status < 0)
2128 goto error;
2129
2130 if (state.ptr == state.start)
2131 state.start = (void*) ((char*) state.ptr + state.charsize);
2132 else
2133 state.start = state.ptr;
2134
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002135 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002136
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002137 state_fini(&state);
2138 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002139
2140error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002141 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002142 state_fini(&state);
2143 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002144
Guido van Rossumb700df92000-03-31 14:59:30 +00002145}
2146
Fredrik Lundh703ce812001-10-24 22:16:30 +00002147#if PY_VERSION_HEX >= 0x02020000
2148static PyObject*
2149pattern_finditer(PatternObject* pattern, PyObject* args)
2150{
2151 PyObject* scanner;
2152 PyObject* search;
2153 PyObject* iterator;
2154
2155 scanner = pattern_scanner(pattern, args);
2156 if (!scanner)
2157 return NULL;
2158
2159 search = PyObject_GetAttrString(scanner, "search");
2160 Py_DECREF(scanner);
2161 if (!search)
2162 return NULL;
2163
2164 iterator = PyCallIter_New(search, Py_None);
2165 Py_DECREF(search);
2166
2167 return iterator;
2168}
2169#endif
2170
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002171static PyObject*
2172pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2173{
2174 SRE_STATE state;
2175 PyObject* list;
2176 PyObject* item;
2177 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002178 Py_ssize_t n;
2179 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002180 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002181
2182 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002183 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002184 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002185 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002186 &string, &maxsplit))
2187 return NULL;
2188
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002189 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002190 if (!string)
2191 return NULL;
2192
2193 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002194 if (!list) {
2195 state_fini(&state);
2196 return NULL;
2197 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002198
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002199 n = 0;
2200 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002201
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002202 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002203
2204 state_reset(&state);
2205
2206 state.ptr = state.start;
2207
2208 if (state.charsize == 1) {
2209 status = sre_search(&state, PatternObject_GetCode(self));
2210 } else {
2211#if defined(HAVE_UNICODE)
2212 status = sre_usearch(&state, PatternObject_GetCode(self));
2213#endif
2214 }
2215
Thomas Wouters89f507f2006-12-13 04:49:30 +00002216 if (PyErr_Occurred())
2217 goto error;
2218
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002219 if (status <= 0) {
2220 if (status == 0)
2221 break;
2222 pattern_error(status);
2223 goto error;
2224 }
Tim Peters3d563502006-01-21 02:47:53 +00002225
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002226 if (state.start == state.ptr) {
2227 if (last == state.end)
2228 break;
2229 /* skip one character */
2230 state.start = (void*) ((char*) state.ptr + state.charsize);
2231 continue;
2232 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002233
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002234 /* get segment before this match */
2235 item = PySequence_GetSlice(
2236 string, STATE_OFFSET(&state, last),
2237 STATE_OFFSET(&state, state.start)
2238 );
2239 if (!item)
2240 goto error;
2241 status = PyList_Append(list, item);
2242 Py_DECREF(item);
2243 if (status < 0)
2244 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002245
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002246 /* add groups (if any) */
2247 for (i = 0; i < self->groups; i++) {
2248 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002249 if (!item)
2250 goto error;
2251 status = PyList_Append(list, item);
2252 Py_DECREF(item);
2253 if (status < 0)
2254 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002255 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002256
2257 n = n + 1;
2258
2259 last = state.start = state.ptr;
2260
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002261 }
2262
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002263 /* get segment following last match (even if empty) */
2264 item = PySequence_GetSlice(
2265 string, STATE_OFFSET(&state, last), state.endpos
2266 );
2267 if (!item)
2268 goto error;
2269 status = PyList_Append(list, item);
2270 Py_DECREF(item);
2271 if (status < 0)
2272 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002273
2274 state_fini(&state);
2275 return list;
2276
2277error:
2278 Py_DECREF(list);
2279 state_fini(&state);
2280 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002281
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002282}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002283
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002284static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002285pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002286 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002287{
2288 SRE_STATE state;
2289 PyObject* list;
2290 PyObject* item;
2291 PyObject* filter;
2292 PyObject* args;
2293 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002294 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002295 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002296 Py_ssize_t n;
2297 Py_ssize_t i, b, e;
2298 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002299 int filter_is_callable;
2300
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002301 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002302 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002303 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002304 Py_INCREF(filter);
2305 filter_is_callable = 1;
2306 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002307 /* if not callable, check if it's a literal string */
2308 int literal;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002309 ptr = getstring(ptemplate, &n, &bint);
2310 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002311 if (ptr) {
2312 if (b == 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002313 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002314 } else {
2315#if defined(HAVE_UNICODE)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002316 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002317#endif
2318 }
2319 } else {
2320 PyErr_Clear();
2321 literal = 0;
2322 }
2323 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002324 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002325 Py_INCREF(filter);
2326 filter_is_callable = 0;
2327 } else {
2328 /* not a literal; hand it over to the template compiler */
2329 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002330 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002331 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002332 );
2333 if (!filter)
2334 return NULL;
2335 filter_is_callable = PyCallable_Check(filter);
2336 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002337 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002338
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002339 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002340 if (!string) {
2341 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002342 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002343 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002344
2345 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002346 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002347 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002348 state_fini(&state);
2349 return NULL;
2350 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002351
2352 n = i = 0;
2353
2354 while (!count || n < count) {
2355
2356 state_reset(&state);
2357
2358 state.ptr = state.start;
2359
2360 if (state.charsize == 1) {
2361 status = sre_search(&state, PatternObject_GetCode(self));
2362 } else {
2363#if defined(HAVE_UNICODE)
2364 status = sre_usearch(&state, PatternObject_GetCode(self));
2365#endif
2366 }
2367
Thomas Wouters89f507f2006-12-13 04:49:30 +00002368 if (PyErr_Occurred())
2369 goto error;
2370
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002371 if (status <= 0) {
2372 if (status == 0)
2373 break;
2374 pattern_error(status);
2375 goto error;
2376 }
Tim Peters3d563502006-01-21 02:47:53 +00002377
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002378 b = STATE_OFFSET(&state, state.start);
2379 e = STATE_OFFSET(&state, state.ptr);
2380
2381 if (i < b) {
2382 /* get segment before this match */
2383 item = PySequence_GetSlice(string, i, b);
2384 if (!item)
2385 goto error;
2386 status = PyList_Append(list, item);
2387 Py_DECREF(item);
2388 if (status < 0)
2389 goto error;
2390
2391 } else if (i == b && i == e && n > 0)
2392 /* ignore empty match on latest position */
2393 goto next;
2394
2395 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002396 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002397 match = pattern_new_match(self, &state, 1);
2398 if (!match)
2399 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002400 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002401 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002402 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002403 goto error;
2404 }
2405 item = PyObject_CallObject(filter, args);
2406 Py_DECREF(args);
2407 Py_DECREF(match);
2408 if (!item)
2409 goto error;
2410 } else {
2411 /* filter is literal string */
2412 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002413 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002414 }
2415
2416 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002417 if (item != Py_None) {
2418 status = PyList_Append(list, item);
2419 Py_DECREF(item);
2420 if (status < 0)
2421 goto error;
2422 }
Tim Peters3d563502006-01-21 02:47:53 +00002423
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002424 i = e;
2425 n = n + 1;
2426
2427next:
2428 /* move on */
2429 if (state.ptr == state.start)
2430 state.start = (void*) ((char*) state.ptr + state.charsize);
2431 else
2432 state.start = state.ptr;
2433
2434 }
2435
2436 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002437 if (i < state.endpos) {
2438 item = PySequence_GetSlice(string, i, state.endpos);
2439 if (!item)
2440 goto error;
2441 status = PyList_Append(list, item);
2442 Py_DECREF(item);
2443 if (status < 0)
2444 goto error;
2445 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002446
2447 state_fini(&state);
2448
Guido van Rossum4e173842001-12-07 04:25:10 +00002449 Py_DECREF(filter);
2450
Fredrik Lundhdac58492001-10-21 21:48:30 +00002451 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002452 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002453
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002454 if (!item)
2455 return NULL;
2456
2457 if (subn)
2458 return Py_BuildValue("Ni", item, n);
2459
2460 return item;
2461
2462error:
2463 Py_DECREF(list);
2464 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002465 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002467
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002468}
2469
2470static PyObject*
2471pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2472{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002475 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002476 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002477 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002478 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002479 return NULL;
2480
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002481 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002482}
2483
2484static PyObject*
2485pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2486{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002487 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002488 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002489 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002490 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002491 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002492 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002493 return NULL;
2494
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002495 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002496}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002497
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002498static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002499pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002500{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002501#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002502 PatternObject* copy;
2503 int offset;
2504
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002505 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2506 if (!copy)
2507 return NULL;
2508
2509 offset = offsetof(PatternObject, groups);
2510
2511 Py_XINCREF(self->groupindex);
2512 Py_XINCREF(self->indexgroup);
2513 Py_XINCREF(self->pattern);
2514
2515 memcpy((char*) copy + offset, (char*) self + offset,
2516 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002517 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002518
2519 return (PyObject*) copy;
2520#else
2521 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2522 return NULL;
2523#endif
2524}
2525
2526static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002527pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002528{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002529#ifdef USE_BUILTIN_COPY
2530 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002531
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002532 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002533 if (!copy)
2534 return NULL;
2535
2536 if (!deepcopy(&copy->groupindex, memo) ||
2537 !deepcopy(&copy->indexgroup, memo) ||
2538 !deepcopy(&copy->pattern, memo)) {
2539 Py_DECREF(copy);
2540 return NULL;
2541 }
2542
2543#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002544 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2545 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002546#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002547}
2548
Raymond Hettinger94478742004-09-24 04:31:19 +00002549PyDoc_STRVAR(pattern_match_doc,
2550"match(string[, pos[, endpos]]) --> match object or None.\n\
2551 Matches zero or more characters at the beginning of the string");
2552
2553PyDoc_STRVAR(pattern_search_doc,
2554"search(string[, pos[, endpos]]) --> match object or None.\n\
2555 Scan through string looking for a match, and return a corresponding\n\
2556 MatchObject instance. Return None if no position in the string matches.");
2557
2558PyDoc_STRVAR(pattern_split_doc,
2559"split(string[, maxsplit = 0]) --> list.\n\
2560 Split string by the occurrences of pattern.");
2561
2562PyDoc_STRVAR(pattern_findall_doc,
2563"findall(string[, pos[, endpos]]) --> list.\n\
2564 Return a list of all non-overlapping matches of pattern in string.");
2565
2566PyDoc_STRVAR(pattern_finditer_doc,
2567"finditer(string[, pos[, endpos]]) --> iterator.\n\
2568 Return an iterator over all non-overlapping matches for the \n\
2569 RE pattern in string. For each match, the iterator returns a\n\
2570 match object.");
2571
2572PyDoc_STRVAR(pattern_sub_doc,
2573"sub(repl, string[, count = 0]) --> newstring\n\
2574 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002575 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002576
2577PyDoc_STRVAR(pattern_subn_doc,
2578"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2579 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2580 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002581 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002582
2583PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2584
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002585static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002586 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002587 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002588 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002589 pattern_search_doc},
2590 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2591 pattern_sub_doc},
2592 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2593 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002594 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002595 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002596 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002597 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002598#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002599 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2600 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002601#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002602 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002603 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2604 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002605 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002606};
2607
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002608#define PAT_OFF(x) offsetof(PatternObject, x)
2609static PyMemberDef pattern_members[] = {
2610 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2611 {"flags", T_INT, PAT_OFF(flags), READONLY},
2612 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2613 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2614 {NULL} /* Sentinel */
2615};
Guido van Rossumb700df92000-03-31 14:59:30 +00002616
Neal Norwitz57c179c2006-03-22 07:18:02 +00002617static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002618 PyVarObject_HEAD_INIT(NULL, 0)
2619 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002620 sizeof(PatternObject), sizeof(SRE_CODE),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002621 (destructor)pattern_dealloc, /* tp_dealloc */
2622 0, /* tp_print */
2623 0, /* tp_getattr */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002624 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00002625 0, /* tp_reserved */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002626 0, /* tp_repr */
2627 0, /* tp_as_number */
2628 0, /* tp_as_sequence */
2629 0, /* tp_as_mapping */
2630 0, /* tp_hash */
2631 0, /* tp_call */
2632 0, /* tp_str */
2633 0, /* tp_getattro */
2634 0, /* tp_setattro */
2635 0, /* tp_as_buffer */
Guido van Rossum3cf5b1e2006-07-27 21:53:35 +00002636 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002637 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002638 0, /* tp_traverse */
2639 0, /* tp_clear */
2640 0, /* tp_richcompare */
2641 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002642 0, /* tp_iter */
2643 0, /* tp_iternext */
2644 pattern_methods, /* tp_methods */
2645 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002646};
2647
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002648static int _validate(PatternObject *self); /* Forward */
2649
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002650static PyObject *
2651_compile(PyObject* self_, PyObject* args)
2652{
2653 /* "compile" pattern descriptor to pattern object */
2654
2655 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002656 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002657
2658 PyObject* pattern;
2659 int flags = 0;
2660 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002661 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002662 PyObject* groupindex = NULL;
2663 PyObject* indexgroup = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002664 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002665 &PyList_Type, &code, &groups,
2666 &groupindex, &indexgroup))
2667 return NULL;
2668
2669 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002670 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002671 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2672 if (!self)
2673 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002674 self->weakreflist = NULL;
2675 self->pattern = NULL;
2676 self->groupindex = NULL;
2677 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002678
2679 self->codesize = n;
2680
2681 for (i = 0; i < n; i++) {
2682 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002683 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002684 self->code[i] = (SRE_CODE) value;
2685 if ((unsigned long) self->code[i] != value) {
2686 PyErr_SetString(PyExc_OverflowError,
2687 "regular expression code size limit exceeded");
2688 break;
2689 }
2690 }
2691
2692 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002693 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002694 return NULL;
2695 }
2696
Antoine Pitroufd036452008-08-19 17:56:33 +00002697 if (pattern == Py_None)
2698 self->charsize = -1;
2699 else {
2700 Py_ssize_t p_length;
2701 if (!getstring(pattern, &p_length, &self->charsize)) {
Victor Stinner5abeafb2010-03-04 21:59:53 +00002702 Py_DECREF(self);
Antoine Pitroufd036452008-08-19 17:56:33 +00002703 return NULL;
2704 }
2705 }
2706
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002707 Py_INCREF(pattern);
2708 self->pattern = pattern;
2709
2710 self->flags = flags;
2711
2712 self->groups = groups;
2713
2714 Py_XINCREF(groupindex);
2715 self->groupindex = groupindex;
2716
2717 Py_XINCREF(indexgroup);
2718 self->indexgroup = indexgroup;
2719
2720 self->weakreflist = NULL;
2721
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002722 if (!_validate(self)) {
2723 Py_DECREF(self);
2724 return NULL;
2725 }
2726
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002727 return (PyObject*) self;
2728}
2729
Guido van Rossumb700df92000-03-31 14:59:30 +00002730/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002731/* Code validation */
2732
2733/* To learn more about this code, have a look at the _compile() function in
2734 Lib/sre_compile.py. The validation functions below checks the code array
2735 for conformance with the code patterns generated there.
2736
2737 The nice thing about the generated code is that it is position-independent:
2738 all jumps are relative jumps forward. Also, jumps don't cross each other:
2739 the target of a later jump is always earlier than the target of an earlier
2740 jump. IOW, this is okay:
2741
2742 J---------J-------T--------T
2743 \ \_____/ /
2744 \______________________/
2745
2746 but this is not:
2747
2748 J---------J-------T--------T
2749 \_________\_____/ /
2750 \____________/
2751
2752 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2753 bytes wide (the latter if Python is compiled for "wide" unicode support).
2754*/
2755
2756/* Defining this one enables tracing of the validator */
2757#undef VVERBOSE
2758
2759/* Trace macro for the validator */
2760#if defined(VVERBOSE)
2761#define VTRACE(v) printf v
2762#else
2763#define VTRACE(v)
2764#endif
2765
2766/* Report failure */
2767#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2768
2769/* Extract opcode, argument, or skip count from code array */
2770#define GET_OP \
2771 do { \
2772 VTRACE(("%p: ", code)); \
2773 if (code >= end) FAIL; \
2774 op = *code++; \
2775 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2776 } while (0)
2777#define GET_ARG \
2778 do { \
2779 VTRACE(("%p= ", code)); \
2780 if (code >= end) FAIL; \
2781 arg = *code++; \
2782 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2783 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002784#define GET_SKIP_ADJ(adj) \
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002785 do { \
2786 VTRACE(("%p= ", code)); \
2787 if (code >= end) FAIL; \
2788 skip = *code; \
2789 VTRACE(("%lu (skip to %p)\n", \
2790 (unsigned long)skip, code+skip)); \
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002791 if (code+skip-adj < code || code+skip-adj > end)\
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002792 FAIL; \
2793 code++; \
2794 } while (0)
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00002795#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002796
2797static int
2798_validate_charset(SRE_CODE *code, SRE_CODE *end)
2799{
2800 /* Some variables are manipulated by the macros above */
2801 SRE_CODE op;
2802 SRE_CODE arg;
2803 SRE_CODE offset;
2804 int i;
2805
2806 while (code < end) {
2807 GET_OP;
2808 switch (op) {
2809
2810 case SRE_OP_NEGATE:
2811 break;
2812
2813 case SRE_OP_LITERAL:
2814 GET_ARG;
2815 break;
2816
2817 case SRE_OP_RANGE:
2818 GET_ARG;
2819 GET_ARG;
2820 break;
2821
2822 case SRE_OP_CHARSET:
2823 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2824 if (code+offset < code || code+offset > end)
2825 FAIL;
2826 code += offset;
2827 break;
2828
2829 case SRE_OP_BIGCHARSET:
2830 GET_ARG; /* Number of blocks */
2831 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2832 if (code+offset < code || code+offset > end)
2833 FAIL;
2834 /* Make sure that each byte points to a valid block */
2835 for (i = 0; i < 256; i++) {
2836 if (((unsigned char *)code)[i] >= arg)
2837 FAIL;
2838 }
2839 code += offset;
2840 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2841 if (code+offset < code || code+offset > end)
2842 FAIL;
2843 code += offset;
2844 break;
2845
2846 case SRE_OP_CATEGORY:
2847 GET_ARG;
2848 switch (arg) {
2849 case SRE_CATEGORY_DIGIT:
2850 case SRE_CATEGORY_NOT_DIGIT:
2851 case SRE_CATEGORY_SPACE:
2852 case SRE_CATEGORY_NOT_SPACE:
2853 case SRE_CATEGORY_WORD:
2854 case SRE_CATEGORY_NOT_WORD:
2855 case SRE_CATEGORY_LINEBREAK:
2856 case SRE_CATEGORY_NOT_LINEBREAK:
2857 case SRE_CATEGORY_LOC_WORD:
2858 case SRE_CATEGORY_LOC_NOT_WORD:
2859 case SRE_CATEGORY_UNI_DIGIT:
2860 case SRE_CATEGORY_UNI_NOT_DIGIT:
2861 case SRE_CATEGORY_UNI_SPACE:
2862 case SRE_CATEGORY_UNI_NOT_SPACE:
2863 case SRE_CATEGORY_UNI_WORD:
2864 case SRE_CATEGORY_UNI_NOT_WORD:
2865 case SRE_CATEGORY_UNI_LINEBREAK:
2866 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2867 break;
2868 default:
2869 FAIL;
2870 }
2871 break;
2872
2873 default:
2874 FAIL;
2875
2876 }
2877 }
2878
2879 return 1;
2880}
2881
2882static int
2883_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2884{
2885 /* Some variables are manipulated by the macros above */
2886 SRE_CODE op;
2887 SRE_CODE arg;
2888 SRE_CODE skip;
2889
2890 VTRACE(("code=%p, end=%p\n", code, end));
2891
2892 if (code > end)
2893 FAIL;
2894
2895 while (code < end) {
2896 GET_OP;
2897 switch (op) {
2898
2899 case SRE_OP_MARK:
2900 /* We don't check whether marks are properly nested; the
2901 sre_match() code is robust even if they don't, and the worst
2902 you can get is nonsensical match results. */
2903 GET_ARG;
2904 if (arg > 2*groups+1) {
2905 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2906 FAIL;
2907 }
2908 break;
2909
2910 case SRE_OP_LITERAL:
2911 case SRE_OP_NOT_LITERAL:
2912 case SRE_OP_LITERAL_IGNORE:
2913 case SRE_OP_NOT_LITERAL_IGNORE:
2914 GET_ARG;
2915 /* The arg is just a character, nothing to check */
2916 break;
2917
2918 case SRE_OP_SUCCESS:
2919 case SRE_OP_FAILURE:
2920 /* Nothing to check; these normally end the matching process */
2921 break;
2922
2923 case SRE_OP_AT:
2924 GET_ARG;
2925 switch (arg) {
2926 case SRE_AT_BEGINNING:
2927 case SRE_AT_BEGINNING_STRING:
2928 case SRE_AT_BEGINNING_LINE:
2929 case SRE_AT_END:
2930 case SRE_AT_END_LINE:
2931 case SRE_AT_END_STRING:
2932 case SRE_AT_BOUNDARY:
2933 case SRE_AT_NON_BOUNDARY:
2934 case SRE_AT_LOC_BOUNDARY:
2935 case SRE_AT_LOC_NON_BOUNDARY:
2936 case SRE_AT_UNI_BOUNDARY:
2937 case SRE_AT_UNI_NON_BOUNDARY:
2938 break;
2939 default:
2940 FAIL;
2941 }
2942 break;
2943
2944 case SRE_OP_ANY:
2945 case SRE_OP_ANY_ALL:
2946 /* These have no operands */
2947 break;
2948
2949 case SRE_OP_IN:
2950 case SRE_OP_IN_IGNORE:
2951 GET_SKIP;
2952 /* Stop 1 before the end; we check the FAILURE below */
2953 if (!_validate_charset(code, code+skip-2))
2954 FAIL;
2955 if (code[skip-2] != SRE_OP_FAILURE)
2956 FAIL;
2957 code += skip-1;
2958 break;
2959
2960 case SRE_OP_INFO:
2961 {
2962 /* A minimal info field is
2963 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2964 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2965 more follows. */
2966 SRE_CODE flags, min, max, i;
2967 SRE_CODE *newcode;
2968 GET_SKIP;
2969 newcode = code+skip-1;
2970 GET_ARG; flags = arg;
2971 GET_ARG; min = arg;
2972 GET_ARG; max = arg;
2973 /* Check that only valid flags are present */
2974 if ((flags & ~(SRE_INFO_PREFIX |
2975 SRE_INFO_LITERAL |
2976 SRE_INFO_CHARSET)) != 0)
2977 FAIL;
2978 /* PREFIX and CHARSET are mutually exclusive */
2979 if ((flags & SRE_INFO_PREFIX) &&
2980 (flags & SRE_INFO_CHARSET))
2981 FAIL;
2982 /* LITERAL implies PREFIX */
2983 if ((flags & SRE_INFO_LITERAL) &&
2984 !(flags & SRE_INFO_PREFIX))
2985 FAIL;
2986 /* Validate the prefix */
2987 if (flags & SRE_INFO_PREFIX) {
2988 SRE_CODE prefix_len, prefix_skip;
2989 GET_ARG; prefix_len = arg;
2990 GET_ARG; prefix_skip = arg;
2991 /* Here comes the prefix string */
2992 if (code+prefix_len < code || code+prefix_len > newcode)
2993 FAIL;
2994 code += prefix_len;
2995 /* And here comes the overlap table */
2996 if (code+prefix_len < code || code+prefix_len > newcode)
2997 FAIL;
2998 /* Each overlap value should be < prefix_len */
2999 for (i = 0; i < prefix_len; i++) {
3000 if (code[i] >= prefix_len)
3001 FAIL;
3002 }
3003 code += prefix_len;
3004 }
3005 /* Validate the charset */
3006 if (flags & SRE_INFO_CHARSET) {
3007 if (!_validate_charset(code, newcode-1))
3008 FAIL;
3009 if (newcode[-1] != SRE_OP_FAILURE)
3010 FAIL;
3011 code = newcode;
3012 }
3013 else if (code != newcode) {
3014 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3015 FAIL;
3016 }
3017 }
3018 break;
3019
3020 case SRE_OP_BRANCH:
3021 {
3022 SRE_CODE *target = NULL;
3023 for (;;) {
3024 GET_SKIP;
3025 if (skip == 0)
3026 break;
3027 /* Stop 2 before the end; we check the JUMP below */
3028 if (!_validate_inner(code, code+skip-3, groups))
3029 FAIL;
3030 code += skip-3;
3031 /* Check that it ends with a JUMP, and that each JUMP
3032 has the same target */
3033 GET_OP;
3034 if (op != SRE_OP_JUMP)
3035 FAIL;
3036 GET_SKIP;
3037 if (target == NULL)
3038 target = code+skip-1;
3039 else if (code+skip-1 != target)
3040 FAIL;
3041 }
3042 }
3043 break;
3044
3045 case SRE_OP_REPEAT_ONE:
3046 case SRE_OP_MIN_REPEAT_ONE:
3047 {
3048 SRE_CODE min, max;
3049 GET_SKIP;
3050 GET_ARG; min = arg;
3051 GET_ARG; max = arg;
3052 if (min > max)
3053 FAIL;
3054#ifdef Py_UNICODE_WIDE
3055 if (max > 65535)
3056 FAIL;
3057#endif
3058 if (!_validate_inner(code, code+skip-4, groups))
3059 FAIL;
3060 code += skip-4;
3061 GET_OP;
3062 if (op != SRE_OP_SUCCESS)
3063 FAIL;
3064 }
3065 break;
3066
3067 case SRE_OP_REPEAT:
3068 {
3069 SRE_CODE min, max;
3070 GET_SKIP;
3071 GET_ARG; min = arg;
3072 GET_ARG; max = arg;
3073 if (min > max)
3074 FAIL;
3075#ifdef Py_UNICODE_WIDE
3076 if (max > 65535)
3077 FAIL;
3078#endif
3079 if (!_validate_inner(code, code+skip-3, groups))
3080 FAIL;
3081 code += skip-3;
3082 GET_OP;
3083 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3084 FAIL;
3085 }
3086 break;
3087
3088 case SRE_OP_GROUPREF:
3089 case SRE_OP_GROUPREF_IGNORE:
3090 GET_ARG;
3091 if (arg >= groups)
3092 FAIL;
3093 break;
3094
3095 case SRE_OP_GROUPREF_EXISTS:
3096 /* The regex syntax for this is: '(?(group)then|else)', where
3097 'group' is either an integer group number or a group name,
3098 'then' and 'else' are sub-regexes, and 'else' is optional. */
3099 GET_ARG;
3100 if (arg >= groups)
3101 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003102 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003103 code--; /* The skip is relative to the first arg! */
3104 /* There are two possibilities here: if there is both a 'then'
3105 part and an 'else' part, the generated code looks like:
3106
3107 GROUPREF_EXISTS
3108 <group>
3109 <skipyes>
3110 ...then part...
3111 JUMP
3112 <skipno>
3113 (<skipyes> jumps here)
3114 ...else part...
3115 (<skipno> jumps here)
3116
3117 If there is only a 'then' part, it looks like:
3118
3119 GROUPREF_EXISTS
3120 <group>
3121 <skip>
3122 ...then part...
3123 (<skip> jumps here)
3124
3125 There is no direct way to decide which it is, and we don't want
3126 to allow arbitrary jumps anywhere in the code; so we just look
3127 for a JUMP opcode preceding our skip target.
3128 */
3129 if (skip >= 3 && code+skip-3 >= code &&
3130 code[skip-3] == SRE_OP_JUMP)
3131 {
3132 VTRACE(("both then and else parts present\n"));
3133 if (!_validate_inner(code+1, code+skip-3, groups))
3134 FAIL;
3135 code += skip-2; /* Position after JUMP, at <skipno> */
3136 GET_SKIP;
3137 if (!_validate_inner(code, code+skip-1, groups))
3138 FAIL;
3139 code += skip-1;
3140 }
3141 else {
3142 VTRACE(("only a then part present\n"));
3143 if (!_validate_inner(code+1, code+skip-1, groups))
3144 FAIL;
3145 code += skip-1;
3146 }
3147 break;
3148
3149 case SRE_OP_ASSERT:
3150 case SRE_OP_ASSERT_NOT:
3151 GET_SKIP;
3152 GET_ARG; /* 0 for lookahead, width for lookbehind */
3153 code--; /* Back up over arg to simplify math below */
3154 if (arg & 0x80000000)
3155 FAIL; /* Width too large */
3156 /* Stop 1 before the end; we check the SUCCESS below */
3157 if (!_validate_inner(code+1, code+skip-2, groups))
3158 FAIL;
3159 code += skip-2;
3160 GET_OP;
3161 if (op != SRE_OP_SUCCESS)
3162 FAIL;
3163 break;
3164
3165 default:
3166 FAIL;
3167
3168 }
3169 }
3170
3171 VTRACE(("okay\n"));
3172 return 1;
3173}
3174
3175static int
3176_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3177{
3178 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3179 FAIL;
3180 if (groups == 0) /* fix for simplejson */
3181 groups = 100; /* 100 groups should always be safe */
3182 return _validate_inner(code, end-1, groups);
3183}
3184
3185static int
3186_validate(PatternObject *self)
3187{
3188 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3189 {
3190 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3191 return 0;
3192 }
3193 else
3194 VTRACE(("Success!\n"));
3195 return 1;
3196}
3197
3198/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003199/* match methods */
3200
3201static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003202match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003203{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003204 Py_XDECREF(self->regs);
3205 Py_XDECREF(self->string);
3206 Py_DECREF(self->pattern);
3207 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003208}
3209
3210static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003211match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003212{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003213 if (index < 0 || index >= self->groups) {
3214 /* raise IndexError if we were given a bad group number */
3215 PyErr_SetString(
3216 PyExc_IndexError,
3217 "no such group"
3218 );
3219 return NULL;
3220 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003221
Fredrik Lundh6f013982000-07-03 18:44:21 +00003222 index *= 2;
3223
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003224 if (self->string == Py_None || self->mark[index] < 0) {
3225 /* return default value if the string or group is undefined */
3226 Py_INCREF(def);
3227 return def;
3228 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003229
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003230 return PySequence_GetSlice(
3231 self->string, self->mark[index], self->mark[index+1]
3232 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003233}
3234
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003235static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003236match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003237{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003238 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003239
Guido van Rossumddefaf32007-01-14 03:31:43 +00003240 if (index == NULL)
3241 /* Default value */
3242 return 0;
3243
Christian Heimes217cfd12007-12-02 14:31:20 +00003244 if (PyLong_Check(index))
3245 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003246
Fredrik Lundh6f013982000-07-03 18:44:21 +00003247 i = -1;
3248
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003249 if (self->pattern->groupindex) {
3250 index = PyObject_GetItem(self->pattern->groupindex, index);
3251 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003252 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003253 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003254 Py_DECREF(index);
3255 } else
3256 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003257 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003258
3259 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003260}
3261
3262static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003263match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003264{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003265 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003266}
3267
3268static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003269match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003270{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003271 /* delegate to Python code */
3272 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003273 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003274 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003275 );
3276}
3277
3278static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003279match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003280{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003281 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003282 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003283
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003284 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003285
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003286 switch (size) {
3287 case 0:
3288 result = match_getslice(self, Py_False, Py_None);
3289 break;
3290 case 1:
3291 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3292 break;
3293 default:
3294 /* fetch multiple items */
3295 result = PyTuple_New(size);
3296 if (!result)
3297 return NULL;
3298 for (i = 0; i < size; i++) {
3299 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003300 self, PyTuple_GET_ITEM(args, i), Py_None
3301 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003302 if (!item) {
3303 Py_DECREF(result);
3304 return NULL;
3305 }
3306 PyTuple_SET_ITEM(result, i, item);
3307 }
3308 break;
3309 }
3310 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003311}
3312
3313static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003314match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003315{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003316 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003317 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003319 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003320 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003321 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003322 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003323
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003324 result = PyTuple_New(self->groups-1);
3325 if (!result)
3326 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003327
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003328 for (index = 1; index < self->groups; index++) {
3329 PyObject* item;
3330 item = match_getslice_by_index(self, index, def);
3331 if (!item) {
3332 Py_DECREF(result);
3333 return NULL;
3334 }
3335 PyTuple_SET_ITEM(result, index-1, item);
3336 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003337
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003338 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003339}
3340
3341static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003342match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003343{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 PyObject* result;
3345 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003346 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003349 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003350 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003351 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003353 result = PyDict_New();
3354 if (!result || !self->pattern->groupindex)
3355 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003356
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003357 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003358 if (!keys)
3359 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003360
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003361 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003362 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003363 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003364 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003365 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003366 if (!key)
3367 goto failed;
3368 value = match_getslice(self, key, def);
3369 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003370 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003371 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003372 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003373 status = PyDict_SetItem(result, key, value);
3374 Py_DECREF(value);
3375 if (status < 0)
3376 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003377 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003378
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003379 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003380
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003381 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003382
3383failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003384 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003385 Py_DECREF(result);
3386 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003387}
3388
3389static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003390match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003391{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003392 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003393
Guido van Rossumddefaf32007-01-14 03:31:43 +00003394 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003395 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003396 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003397
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003398 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003399
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003400 if (index < 0 || index >= self->groups) {
3401 PyErr_SetString(
3402 PyExc_IndexError,
3403 "no such group"
3404 );
3405 return NULL;
3406 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003407
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003408 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003409 return Py_BuildValue("i", self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003410}
3411
3412static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003413match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003414{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003415 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003416
Guido van Rossumddefaf32007-01-14 03:31:43 +00003417 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003418 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003419 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003420
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003421 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003422
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003423 if (index < 0 || index >= self->groups) {
3424 PyErr_SetString(
3425 PyExc_IndexError,
3426 "no such group"
3427 );
3428 return NULL;
3429 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003430
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003431 /* mark is -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003432 return Py_BuildValue("i", self->mark[index*2+1]);
3433}
3434
3435LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003436_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003437{
3438 PyObject* pair;
3439 PyObject* item;
3440
3441 pair = PyTuple_New(2);
3442 if (!pair)
3443 return NULL;
3444
Christian Heimes217cfd12007-12-02 14:31:20 +00003445 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003446 if (!item)
3447 goto error;
3448 PyTuple_SET_ITEM(pair, 0, item);
3449
Christian Heimes217cfd12007-12-02 14:31:20 +00003450 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003451 if (!item)
3452 goto error;
3453 PyTuple_SET_ITEM(pair, 1, item);
3454
3455 return pair;
3456
3457 error:
3458 Py_DECREF(pair);
3459 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003460}
3461
3462static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003463match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003464{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003465 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003466
Guido van Rossumddefaf32007-01-14 03:31:43 +00003467 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003468 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003469 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003470
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003471 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003472
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003473 if (index < 0 || index >= self->groups) {
3474 PyErr_SetString(
3475 PyExc_IndexError,
3476 "no such group"
3477 );
3478 return NULL;
3479 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003480
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003481 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003482 return _pair(self->mark[index*2], self->mark[index*2+1]);
3483}
3484
3485static PyObject*
3486match_regs(MatchObject* self)
3487{
3488 PyObject* regs;
3489 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003490 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003491
3492 regs = PyTuple_New(self->groups);
3493 if (!regs)
3494 return NULL;
3495
3496 for (index = 0; index < self->groups; index++) {
3497 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3498 if (!item) {
3499 Py_DECREF(regs);
3500 return NULL;
3501 }
3502 PyTuple_SET_ITEM(regs, index, item);
3503 }
3504
3505 Py_INCREF(regs);
3506 self->regs = regs;
3507
3508 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003509}
3510
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003511static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003512match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003513{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003514#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003515 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003516 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003517
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003518 slots = 2 * (self->pattern->groups+1);
3519
3520 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3521 if (!copy)
3522 return NULL;
3523
3524 /* this value a constant, but any compiler should be able to
3525 figure that out all by itself */
3526 offset = offsetof(MatchObject, string);
3527
3528 Py_XINCREF(self->pattern);
3529 Py_XINCREF(self->string);
3530 Py_XINCREF(self->regs);
3531
3532 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003533 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003534
3535 return (PyObject*) copy;
3536#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003537 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003538 return NULL;
3539#endif
3540}
3541
3542static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003543match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003544{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003545#ifdef USE_BUILTIN_COPY
3546 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003547
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003548 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003549 if (!copy)
3550 return NULL;
3551
3552 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3553 !deepcopy(&copy->string, memo) ||
3554 !deepcopy(&copy->regs, memo)) {
3555 Py_DECREF(copy);
3556 return NULL;
3557 }
3558
3559#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003560 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3561 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003562#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003563}
3564
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003565static PyMethodDef match_methods[] = {
Fredrik Lundh562586e2000-10-03 20:43:34 +00003566 {"group", (PyCFunction) match_group, METH_VARARGS},
3567 {"start", (PyCFunction) match_start, METH_VARARGS},
3568 {"end", (PyCFunction) match_end, METH_VARARGS},
3569 {"span", (PyCFunction) match_span, METH_VARARGS},
3570 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3571 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003572 {"expand", (PyCFunction) match_expand, METH_O},
3573 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3574 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003575 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003576};
3577
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003578static PyObject *
3579match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003580{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003581 if (self->lastindex >= 0)
3582 return Py_BuildValue("i", self->lastindex);
3583 Py_INCREF(Py_None);
3584 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003585}
3586
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003587static PyObject *
3588match_lastgroup_get(MatchObject *self)
3589{
3590 if (self->pattern->indexgroup && self->lastindex >= 0) {
3591 PyObject* result = PySequence_GetItem(
3592 self->pattern->indexgroup, self->lastindex
3593 );
3594 if (result)
3595 return result;
3596 PyErr_Clear();
3597 }
3598 Py_INCREF(Py_None);
3599 return Py_None;
3600}
3601
3602static PyObject *
3603match_regs_get(MatchObject *self)
3604{
3605 if (self->regs) {
3606 Py_INCREF(self->regs);
3607 return self->regs;
3608 } else
3609 return match_regs(self);
3610}
3611
3612static PyGetSetDef match_getset[] = {
3613 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3614 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3615 {"regs", (getter)match_regs_get, (setter)NULL},
3616 {NULL}
3617};
3618
3619#define MATCH_OFF(x) offsetof(MatchObject, x)
3620static PyMemberDef match_members[] = {
3621 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3622 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3623 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3624 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3625 {NULL}
3626};
3627
Guido van Rossumb700df92000-03-31 14:59:30 +00003628/* FIXME: implement setattr("string", None) as a special case (to
3629 detach the associated string, if any */
3630
Neal Norwitz57c179c2006-03-22 07:18:02 +00003631static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003632 PyVarObject_HEAD_INIT(NULL,0)
3633 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003634 sizeof(MatchObject), sizeof(Py_ssize_t),
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003635 (destructor)match_dealloc, /* tp_dealloc */
3636 0, /* tp_print */
3637 0, /* tp_getattr */
3638 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003639 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003640 0, /* tp_repr */
3641 0, /* tp_as_number */
3642 0, /* tp_as_sequence */
3643 0, /* tp_as_mapping */
3644 0, /* tp_hash */
3645 0, /* tp_call */
3646 0, /* tp_str */
3647 0, /* tp_getattro */
3648 0, /* tp_setattro */
3649 0, /* tp_as_buffer */
3650 Py_TPFLAGS_DEFAULT, /* tp_flags */
3651 0, /* tp_doc */
3652 0, /* tp_traverse */
3653 0, /* tp_clear */
3654 0, /* tp_richcompare */
3655 0, /* tp_weaklistoffset */
3656 0, /* tp_iter */
3657 0, /* tp_iternext */
3658 match_methods, /* tp_methods */
3659 match_members, /* tp_members */
3660 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003661};
3662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003663static PyObject*
3664pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3665{
3666 /* create match object (from state object) */
3667
3668 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003669 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003670 char* base;
3671 int n;
3672
3673 if (status > 0) {
3674
3675 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003676 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003677 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3678 2*(pattern->groups+1));
3679 if (!match)
3680 return NULL;
3681
3682 Py_INCREF(pattern);
3683 match->pattern = pattern;
3684
3685 Py_INCREF(state->string);
3686 match->string = state->string;
3687
3688 match->regs = NULL;
3689 match->groups = pattern->groups+1;
3690
3691 /* fill in group slices */
3692
3693 base = (char*) state->beginning;
3694 n = state->charsize;
3695
3696 match->mark[0] = ((char*) state->start - base) / n;
3697 match->mark[1] = ((char*) state->ptr - base) / n;
3698
3699 for (i = j = 0; i < pattern->groups; i++, j+=2)
3700 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3701 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3702 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3703 } else
3704 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3705
3706 match->pos = state->pos;
3707 match->endpos = state->endpos;
3708
3709 match->lastindex = state->lastindex;
3710
3711 return (PyObject*) match;
3712
3713 } else if (status == 0) {
3714
3715 /* no match */
3716 Py_INCREF(Py_None);
3717 return Py_None;
3718
3719 }
3720
3721 /* internal error */
3722 pattern_error(status);
3723 return NULL;
3724}
3725
3726
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003727/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003728/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003729
3730static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003731scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003732{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003733 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003734 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003735 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003736}
3737
3738static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003739scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003740{
3741 SRE_STATE* state = &self->state;
3742 PyObject* match;
3743 int status;
3744
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003745 state_reset(state);
3746
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003747 state->ptr = state->start;
3748
3749 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003750 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003751 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003752#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003753 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003754#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003755 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003756 if (PyErr_Occurred())
3757 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003758
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003759 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003760 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003761
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003762 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003763 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003764 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003765 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003766
3767 return match;
3768}
3769
3770
3771static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003772scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003773{
3774 SRE_STATE* state = &self->state;
3775 PyObject* match;
3776 int status;
3777
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003778 state_reset(state);
3779
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003780 state->ptr = state->start;
3781
3782 if (state->charsize == 1) {
3783 status = sre_search(state, PatternObject_GetCode(self->pattern));
3784 } else {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003785#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003786 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003787#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003788 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003789 if (PyErr_Occurred())
3790 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003791
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003792 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003793 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003794
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003795 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003796 state->start = (void*) ((char*) state->ptr + state->charsize);
3797 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003798 state->start = state->ptr;
3799
3800 return match;
3801}
3802
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003803static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003804 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3805 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003806 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003807};
3808
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003809#define SCAN_OFF(x) offsetof(ScannerObject, x)
3810static PyMemberDef scanner_members[] = {
3811 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3812 {NULL} /* Sentinel */
3813};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003814
Neal Norwitz57c179c2006-03-22 07:18:02 +00003815static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003816 PyVarObject_HEAD_INIT(NULL, 0)
3817 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003818 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003819 (destructor)scanner_dealloc,/* tp_dealloc */
3820 0, /* tp_print */
3821 0, /* tp_getattr */
3822 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00003823 0, /* tp_reserved */
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003824 0, /* tp_repr */
3825 0, /* tp_as_number */
3826 0, /* tp_as_sequence */
3827 0, /* tp_as_mapping */
3828 0, /* tp_hash */
3829 0, /* tp_call */
3830 0, /* tp_str */
3831 0, /* tp_getattro */
3832 0, /* tp_setattro */
3833 0, /* tp_as_buffer */
3834 Py_TPFLAGS_DEFAULT, /* tp_flags */
3835 0, /* tp_doc */
3836 0, /* tp_traverse */
3837 0, /* tp_clear */
3838 0, /* tp_richcompare */
3839 0, /* tp_weaklistoffset */
3840 0, /* tp_iter */
3841 0, /* tp_iternext */
3842 scanner_methods, /* tp_methods */
3843 scanner_members, /* tp_members */
3844 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003845};
3846
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003847static PyObject*
3848pattern_scanner(PatternObject* pattern, PyObject* args)
3849{
3850 /* create search state object */
3851
3852 ScannerObject* self;
3853
3854 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003855 Py_ssize_t start = 0;
3856 Py_ssize_t end = PY_SSIZE_T_MAX;
3857 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003858 return NULL;
3859
3860 /* create scanner object */
3861 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3862 if (!self)
3863 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003864 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003865
3866 string = state_init(&self->state, pattern, string, start, end);
3867 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003868 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003869 return NULL;
3870 }
3871
3872 Py_INCREF(pattern);
3873 self->pattern = (PyObject*) pattern;
3874
3875 return (PyObject*) self;
3876}
3877
Guido van Rossumb700df92000-03-31 14:59:30 +00003878static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003879 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003880 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003881 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003882 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003883};
3884
Martin v. Löwis1a214512008-06-11 05:26:20 +00003885static struct PyModuleDef sremodule = {
3886 PyModuleDef_HEAD_INIT,
3887 "_" SRE_MODULE,
3888 NULL,
3889 -1,
3890 _functions,
3891 NULL,
3892 NULL,
3893 NULL,
3894 NULL
3895};
3896
3897PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003898{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003899 PyObject* m;
3900 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003901 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003902
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003903 /* Patch object types */
3904 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3905 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003906 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003907
Martin v. Löwis1a214512008-06-11 05:26:20 +00003908 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003909 if (m == NULL)
Martin v. Löwis1a214512008-06-11 05:26:20 +00003910 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003911 d = PyModule_GetDict(m);
3912
Christian Heimes217cfd12007-12-02 14:31:20 +00003913 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003914 if (x) {
3915 PyDict_SetItemString(d, "MAGIC", x);
3916 Py_DECREF(x);
3917 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003918
Christian Heimes217cfd12007-12-02 14:31:20 +00003919 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003920 if (x) {
3921 PyDict_SetItemString(d, "CODESIZE", x);
3922 Py_DECREF(x);
3923 }
3924
Neal Norwitzfe537132007-08-26 03:55:15 +00003925 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003926 if (x) {
3927 PyDict_SetItemString(d, "copyright", x);
3928 Py_DECREF(x);
3929 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003930 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003931}
3932
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003933#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003934
3935/* vim:ts=4:sw=4:et
3936*/