blob: b115e2b3fe08dd201682177b27b93de2a39b2c64 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Neal Norwitza6d80fa2006-06-12 03:05:40 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d52000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000055
Neal Norwitz94a9c092006-03-16 06:30:02 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh971e78b2001-10-20 17:48:46 +000061#if PY_VERSION_HEX >= 0x01060000
62#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000063/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000064#define HAVE_UNICODE
65#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000066#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000068/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000069/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070
71/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000072#define USE_FAST_SEARCH
73
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000074/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000075#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000076
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000077/* enables copy/deepcopy handling (work in progress) */
78#undef USE_BUILTIN_COPY
79
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000080#if PY_VERSION_HEX < 0x01060000
81#define PyObject_DEL(op) PyMem_DEL((op))
82#endif
83
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084/* -------------------------------------------------------------------- */
85
Fredrik Lundh80946112000-06-29 18:03:25 +000086#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000087#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000088#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000089/* fastest possible local call under MSVC */
90#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000092#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000093#else
94#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000095#endif
96
97/* error codes */
98#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000099#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000100#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define SRE_ERROR_MEMORY -9 /* out of memory */
Facundo Batista4473d222008-01-08 21:10:12 +0000102#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +0000103
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000104#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000105#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000106#else
107#define TRACE(v)
108#endif
109
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000110/* -------------------------------------------------------------------- */
111/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000112
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000113/* default character predicates (run sre_chars.py to regenerate tables) */
114
115#define SRE_DIGIT_MASK 1
116#define SRE_SPACE_MASK 2
117#define SRE_LINEBREAK_MASK 4
118#define SRE_ALNUM_MASK 8
119#define SRE_WORD_MASK 16
120
Fredrik Lundh21009b92001-09-18 18:47:09 +0000121/* FIXME: this assumes ASCII. create tables in init_sre() instead */
122
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000123static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1242, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12625, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12724, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1280, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12924, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130
Fredrik Lundhb389df32000-06-29 12:48:37 +0000131static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000013210, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
13327, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13444, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13561, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139120, 121, 122, 123, 124, 125, 126, 127 };
140
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000141#define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143#define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145#define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147#define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149#define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000151
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000152static unsigned int sre_lower(unsigned int ch)
153{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000155}
156
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000157/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000158/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000162#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000163#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000164#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000166static unsigned int sre_lower_locale(unsigned int ch)
167{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000169}
170
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000171/* unicode-specific character predicates */
172
173#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000174
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000175#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000176#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000178#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000179#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000180
181static unsigned int sre_lower_unicode(unsigned int ch)
182{
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184}
185
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000186#endif
187
Guido van Rossumb700df92000-03-31 14:59:30 +0000188LOCAL(int)
189sre_category(SRE_CODE category, unsigned int ch)
190{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000191 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000193 case SRE_CATEGORY_DIGIT:
194 return SRE_IS_DIGIT(ch);
195 case SRE_CATEGORY_NOT_DIGIT:
196 return !SRE_IS_DIGIT(ch);
197 case SRE_CATEGORY_SPACE:
198 return SRE_IS_SPACE(ch);
199 case SRE_CATEGORY_NOT_SPACE:
200 return !SRE_IS_SPACE(ch);
201 case SRE_CATEGORY_WORD:
202 return SRE_IS_WORD(ch);
203 case SRE_CATEGORY_NOT_WORD:
204 return !SRE_IS_WORD(ch);
205 case SRE_CATEGORY_LINEBREAK:
206 return SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_NOT_LINEBREAK:
208 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000209
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000210 case SRE_CATEGORY_LOC_WORD:
211 return SRE_LOC_IS_WORD(ch);
212 case SRE_CATEGORY_LOC_NOT_WORD:
213 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000214
215#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 case SRE_CATEGORY_UNI_DIGIT:
217 return SRE_UNI_IS_DIGIT(ch);
218 case SRE_CATEGORY_UNI_NOT_DIGIT:
219 return !SRE_UNI_IS_DIGIT(ch);
220 case SRE_CATEGORY_UNI_SPACE:
221 return SRE_UNI_IS_SPACE(ch);
222 case SRE_CATEGORY_UNI_NOT_SPACE:
223 return !SRE_UNI_IS_SPACE(ch);
224 case SRE_CATEGORY_UNI_WORD:
225 return SRE_UNI_IS_WORD(ch);
226 case SRE_CATEGORY_UNI_NOT_WORD:
227 return !SRE_UNI_IS_WORD(ch);
228 case SRE_CATEGORY_UNI_LINEBREAK:
229 return SRE_UNI_IS_LINEBREAK(ch);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000232#else
233 case SRE_CATEGORY_UNI_DIGIT:
234 return SRE_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_NOT_DIGIT:
236 return !SRE_IS_DIGIT(ch);
237 case SRE_CATEGORY_UNI_SPACE:
238 return SRE_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_NOT_SPACE:
240 return !SRE_IS_SPACE(ch);
241 case SRE_CATEGORY_UNI_WORD:
242 return SRE_LOC_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_NOT_WORD:
244 return !SRE_LOC_IS_WORD(ch);
245 case SRE_CATEGORY_UNI_LINEBREAK:
246 return SRE_IS_LINEBREAK(ch);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000249#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000250 }
251 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000252}
253
254/* helpers */
255
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000256static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000258{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000259 if (state->data_stack) {
Jack Diederich2d400772006-05-27 15:44:34 +0000260 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000261 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000262 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000264}
265
266static int
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000267data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000268{
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000269 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000270 minsize = state->data_stack_base+size;
271 cursize = state->data_stack_size;
272 if (cursize < minsize) {
273 void* stack;
274 cursize = minsize+minsize/4+1024;
275 TRACE(("allocate/grow stack %d\n", cursize));
Jack Diederich2d400772006-05-27 15:44:34 +0000276 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000277 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000279 return SRE_ERROR_MEMORY;
280 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000281 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000282 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000283 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000284 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000285}
286
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000287/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#define SRE_CHAR unsigned char
290#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000291#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000292#define SRE_CHARSET sre_charset
293#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000295#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000297#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000298
299#if defined(HAVE_UNICODE)
300
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000303#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000304
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000305#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000306#undef SRE_SEARCH
307#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000308#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000309#undef SRE_INFO
310#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000311#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000312#undef SRE_AT
313#undef SRE_CHAR
314
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000316
317#define SRE_CHAR Py_UNICODE
318#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000319#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000320#define SRE_CHARSET sre_ucharset
321#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000322#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000323#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000324#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000325#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000326#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000327
328#endif /* SRE_RECURSIVE */
329
330/* -------------------------------------------------------------------- */
331/* String matching engine */
332
333/* the following section is compiled twice, with different character
334 settings */
335
336LOCAL(int)
337SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000340
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000341 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000346 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000347 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_BEGINNING_LINE:
350 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000351 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000354 return (((void*) (ptr+1) == state->end &&
355 SRE_IS_LINEBREAK((int) ptr[0])) ||
356 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000358 case SRE_AT_END_LINE:
359 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000360 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000361
Fredrik Lundh770617b2001-01-14 15:06:11 +0000362 case SRE_AT_END_STRING:
363 return ((void*) ptr == state->end);
364
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000368 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000370 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000372 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000377 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000378 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000379 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000380 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000381 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382
383 case SRE_AT_LOC_BOUNDARY:
384 if (state->beginning == state->end)
385 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000386 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000388 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000389 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000390 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391
392 case SRE_AT_LOC_NON_BOUNDARY:
393 if (state->beginning == state->end)
394 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000395 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000397 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000398 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000399 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000400
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000401#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000402 case SRE_AT_UNI_BOUNDARY:
403 if (state->beginning == state->end)
404 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000405 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000407 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000408 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000409 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410
411 case SRE_AT_UNI_NON_BOUNDARY:
412 if (state->beginning == state->end)
413 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000414 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000416 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000417 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000418 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000419#endif
420
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000421 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000422
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000423 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000424}
425
426LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000427SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000428{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000431 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000433 for (;;) {
434 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_FAILURE:
437 return !ok;
438
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000439 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000440 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000441 if (ch == set[0])
442 return ok;
443 set++;
444 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000446 case SRE_OP_CATEGORY:
447 /* <CATEGORY> <code> */
448 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000449 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000450 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000451 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000452
Fredrik Lundh3562f112000-07-02 12:00:07 +0000453 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000454 if (sizeof(SRE_CODE) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 return ok;
458 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000459 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000460 else {
461 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith64ab35e2012-12-10 17:45:54 -0800462 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000463 return ok;
464 set += 8;
465 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000466 break;
467
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000468 case SRE_OP_RANGE:
469 /* <RANGE> <lower> <upper> */
470 if (set[0] <= ch && ch <= set[1])
471 return ok;
472 set += 2;
473 break;
474
475 case SRE_OP_NEGATE:
476 ok = !ok;
477 break;
478
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000479 case SRE_OP_BIGCHARSET:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000482 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000483 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000484
485 if (sizeof(SRE_CODE) == 2) {
486 block = ((unsigned char*)set)[ch >> 8];
487 set += 128;
488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 return ok;
490 set += count*16;
491 }
492 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
495 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000496 block = ((unsigned char*)set)[ch >> 8];
497 else
498 block = -1;
499 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000500 if (block >=0 &&
Gregory P. Smith64ab35e2012-12-10 17:45:54 -0800501 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000502 return ok;
503 set += count*8;
504 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000505 break;
506 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000507
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000508 default:
509 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000510 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000511 return 0;
512 }
513 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000514}
515
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000516LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000518LOCAL(Py_ssize_t)
519SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000520{
521 SRE_CODE chr;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000524 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000525
526 /* adjust end */
527 if (maxcount < end - ptr && maxcount != 65535)
528 end = ptr + maxcount;
529
530 switch (pattern[0]) {
531
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000532 case SRE_OP_IN:
533 /* repeated set */
534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 ptr++;
537 break;
538
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539 case SRE_OP_ANY:
540 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 ptr++;
544 break;
545
546 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000547 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 ptr = end;
551 break;
552
553 case SRE_OP_LITERAL:
554 /* repeated literal */
555 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000557 while (ptr < end && (SRE_CODE) *ptr == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_LITERAL_IGNORE:
562 /* repeated literal */
563 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL:
570 /* repeated non-literal */
571 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 while (ptr < end && (SRE_CODE) *ptr != chr)
574 ptr++;
575 break;
Tim Peters3d563502006-01-21 02:47:53 +0000576
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 case SRE_OP_NOT_LITERAL_IGNORE:
578 /* repeated non-literal */
579 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 ptr++;
583 break;
584
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 default:
586 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000588 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000589 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 if (i < 0)
591 return i;
592 if (!i)
593 break;
594 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
596 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000597 return (SRE_CHAR*) state->ptr - ptr;
598 }
599
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000600 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000601 return ptr - (SRE_CHAR*) state->ptr;
602}
603
Fredrik Lundh33accc12000-08-27 20:59:47 +0000604#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000605LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000606SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607{
608 /* check if an SRE_OP_INFO block matches at the current position.
609 returns the number of SRE_CODE objects to skip if successful, 0
610 if no match */
611
612 SRE_CHAR* end = state->end;
613 SRE_CHAR* ptr = state->ptr;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000614 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000615
616 /* check minimal length */
617 if (pattern[3] && (end - ptr) < pattern[3])
618 return 0;
619
620 /* check known prefix */
621 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
622 /* <length> <skip> <prefix data> <overlap data> */
623 for (i = 0; i < pattern[5]; i++)
624 if ((SRE_CODE) ptr[i] != pattern[7 + i])
625 return 0;
626 return pattern[0] + 2 * pattern[6];
627 }
628 return pattern[0];
629}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000630#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000631
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000632/* The macros below should be used to protect recursive SRE_MATCH()
633 * calls that *failed* and do *not* return immediately (IOW, those
634 * that will backtrack). Explaining:
635 *
636 * - Recursive SRE_MATCH() returned true: that's usually a success
637 * (besides atypical cases like ASSERT_NOT), therefore there's no
638 * reason to restore lastmark;
639 *
640 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
641 * is returning to the caller: If the current SRE_MATCH() is the
642 * top function of the recursion, returning false will be a matching
643 * failure, and it doesn't matter where lastmark is pointing to.
644 * If it's *not* the top function, it will be a recursive SRE_MATCH()
645 * failure by itself, and the calling SRE_MATCH() will have to deal
646 * with the failure by the same rules explained here (it will restore
647 * lastmark by itself if necessary);
648 *
649 * - Recursive SRE_MATCH() returned false, and will continue the
650 * outside 'for' loop: must be protected when breaking, since the next
651 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000652 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000653 * - Recursive SRE_MATCH() returned false, and will be called again
654 * inside a local for/while loop: must be protected between each
655 * loop iteration, since the recursive SRE_MATCH() could do anything,
656 * and could potentially depend on lastmark.
657 *
658 * For more information, check the discussion at SF patch #712900.
659 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000660#define LASTMARK_SAVE() \
661 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000662 ctx->lastmark = state->lastmark; \
663 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000664 } while (0)
665#define LASTMARK_RESTORE() \
666 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000667 state->lastmark = ctx->lastmark; \
668 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000669 } while (0)
670
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000671#define RETURN_ERROR(i) do { return i; } while(0)
672#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
673#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674
675#define RETURN_ON_ERROR(i) \
676 do { if (i < 0) RETURN_ERROR(i); } while (0)
677#define RETURN_ON_SUCCESS(i) \
678 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
679#define RETURN_ON_FAILURE(i) \
680 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681
682#define SFY(x) #x
683
684#define DATA_STACK_ALLOC(state, type, ptr) \
685do { \
686 alloc_pos = state->data_stack_base; \
687 TRACE(("allocating %s in %d (%d)\n", \
688 SFY(type), alloc_pos, sizeof(type))); \
689 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
690 int j = data_stack_grow(state, sizeof(type)); \
691 if (j < 0) return j; \
692 if (ctx_pos != -1) \
693 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 } \
695 ptr = (type*)(state->data_stack+alloc_pos); \
696 state->data_stack_base += sizeof(type); \
697} while (0)
698
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000699#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
700do { \
701 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
702 ptr = (type*)(state->data_stack+pos); \
703} while (0)
704
705#define DATA_STACK_PUSH(state, data, size) \
706do { \
707 TRACE(("copy data in %p to %d (%d)\n", \
708 data, state->data_stack_base, size)); \
709 if (state->data_stack_size < state->data_stack_base+size) { \
710 int j = data_stack_grow(state, size); \
711 if (j < 0) return j; \
712 if (ctx_pos != -1) \
713 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 } \
715 memcpy(state->data_stack+state->data_stack_base, data, size); \
716 state->data_stack_base += size; \
717} while (0)
718
719#define DATA_STACK_POP(state, data, size, discard) \
720do { \
721 TRACE(("copy data to %p from %d (%d)\n", \
722 data, state->data_stack_base-size, size)); \
723 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
724 if (discard) \
725 state->data_stack_base -= size; \
726} while (0)
727
728#define DATA_STACK_POP_DISCARD(state, size) \
729do { \
730 TRACE(("discard data from %d (%d)\n", \
731 state->data_stack_base-size, size)); \
732 state->data_stack_base -= size; \
733} while(0)
734
735#define DATA_PUSH(x) \
736 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
737#define DATA_POP(x) \
738 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000739#define DATA_POP_DISCARD(x) \
740 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
741#define DATA_ALLOC(t,p) \
742 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000743#define DATA_LOOKUP_AT(t,p,pos) \
744 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745
746#define MARK_PUSH(lastmark) \
747 do if (lastmark > 0) { \
748 i = lastmark; /* ctx->lastmark may change if reallocated */ \
749 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
750 } while (0)
751#define MARK_POP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
754 } while (0)
755#define MARK_POP_KEEP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
758 } while (0)
759#define MARK_POP_DISCARD(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
762 } while (0)
763
764#define JUMP_NONE 0
765#define JUMP_MAX_UNTIL_1 1
766#define JUMP_MAX_UNTIL_2 2
767#define JUMP_MAX_UNTIL_3 3
768#define JUMP_MIN_UNTIL_1 4
769#define JUMP_MIN_UNTIL_2 5
770#define JUMP_MIN_UNTIL_3 6
771#define JUMP_REPEAT 7
772#define JUMP_REPEAT_ONE_1 8
773#define JUMP_REPEAT_ONE_2 9
774#define JUMP_MIN_REPEAT_ONE 10
775#define JUMP_BRANCH 11
776#define JUMP_ASSERT 12
777#define JUMP_ASSERT_NOT 13
778
779#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
780 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
781 nextctx->last_ctx_pos = ctx_pos; \
782 nextctx->jump = jumpvalue; \
783 nextctx->pattern = nextpattern; \
784 ctx_pos = alloc_pos; \
785 ctx = nextctx; \
786 goto entrance; \
787 jumplabel: \
788 while (0) /* gcc doesn't like labels at end of scopes */ \
789
790typedef struct {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000791 Py_ssize_t last_ctx_pos;
792 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000793 SRE_CHAR* ptr;
794 SRE_CODE* pattern;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000795 Py_ssize_t count;
796 Py_ssize_t lastmark;
797 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798 union {
799 SRE_CODE chr;
800 SRE_REPEAT* rep;
801 } u;
802} SRE_MATCH_CONTEXT;
803
804/* check if string matches the given pattern. returns <0 for
805 error, 0 for failure, and 1 for success */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000806LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000807SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000808{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000809 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000810 Py_ssize_t alloc_pos, ctx_pos = -1;
811 Py_ssize_t i, ret = 0;
812 Py_ssize_t jump;
Facundo Batista4473d222008-01-08 21:10:12 +0000813 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000814
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000815 SRE_MATCH_CONTEXT* ctx;
816 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000817
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000818 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000819
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000820 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
821 ctx->last_ctx_pos = -1;
822 ctx->jump = JUMP_NONE;
823 ctx->pattern = pattern;
824 ctx_pos = alloc_pos;
825
826entrance:
827
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000828 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829
830 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000832 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000833 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000834 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000835 (end - ctx->ptr), ctx->pattern[3]));
836 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000837 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000838 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000839 }
840
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000841 for (;;) {
Facundo Batista4473d222008-01-08 21:10:12 +0000842 ++sigcount;
843 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
844 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000845
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000846 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000847
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000848 case SRE_OP_MARK:
849 /* set mark */
850 /* <MARK> <gid> */
851 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
852 ctx->ptr, ctx->pattern[0]));
853 i = ctx->pattern[0];
854 if (i & 1)
855 state->lastindex = i/2 + 1;
856 if (i > state->lastmark) {
857 /* state->lastmark is the highest valid index in the
858 state->mark array. If it is increased by more than 1,
859 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000860 that these marks have not been encountered. */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000861 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000862 while (j < i)
863 state->mark[j++] = NULL;
864 state->lastmark = i;
865 }
866 state->mark[i] = ctx->ptr;
867 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000868 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 case SRE_OP_LITERAL:
871 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000872 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000873 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
874 ctx->ptr, *ctx->pattern));
875 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
876 RETURN_FAILURE;
877 ctx->pattern++;
878 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000879 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000881 case SRE_OP_NOT_LITERAL:
882 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000883 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000884 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
885 ctx->ptr, *ctx->pattern));
886 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
887 RETURN_FAILURE;
888 ctx->pattern++;
889 ctx->ptr++;
890 break;
891
892 case SRE_OP_SUCCESS:
893 /* end of pattern */
894 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
895 state->ptr = ctx->ptr;
896 RETURN_SUCCESS;
897
898 case SRE_OP_AT:
899 /* match at given position */
900 /* <AT> <code> */
901 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
902 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
903 RETURN_FAILURE;
904 ctx->pattern++;
905 break;
906
907 case SRE_OP_CATEGORY:
908 /* match at given category */
909 /* <CATEGORY> <code> */
910 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
911 ctx->ptr, *ctx->pattern));
912 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
913 RETURN_FAILURE;
914 ctx->pattern++;
915 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000916 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000918 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000919 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000920 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000925 break;
926
927 case SRE_OP_ANY_ALL:
928 /* match anything */
929 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end)
932 RETURN_FAILURE;
933 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000934 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000935
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000936 case SRE_OP_IN:
937 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000938 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000939 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
940 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
941 RETURN_FAILURE;
942 ctx->pattern += ctx->pattern[0];
943 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000944 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000945
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000946 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000947 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
948 ctx->pattern, ctx->ptr, ctx->pattern[0]));
949 if (ctx->ptr >= end ||
950 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
951 RETURN_FAILURE;
952 ctx->pattern++;
953 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000954 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000955
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000956 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000957 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
958 ctx->pattern, ctx->ptr, *ctx->pattern));
959 if (ctx->ptr >= end ||
960 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
961 RETURN_FAILURE;
962 ctx->pattern++;
963 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000964 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000965
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000966 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000967 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
968 if (ctx->ptr >= end
969 || !SRE_CHARSET(ctx->pattern+1,
970 (SRE_CODE)state->lower(*ctx->ptr)))
971 RETURN_FAILURE;
972 ctx->pattern += ctx->pattern[0];
973 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000975
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 case SRE_OP_JUMP:
977 case SRE_OP_INFO:
978 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000980 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
981 ctx->ptr, ctx->pattern[0]));
982 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000983 break;
984
985 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000986 /* alternation */
987 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000988 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000989 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000990 ctx->u.rep = state->repeat;
991 if (ctx->u.rep)
992 MARK_PUSH(ctx->lastmark);
993 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
994 if (ctx->pattern[1] == SRE_OP_LITERAL &&
995 (ctx->ptr >= end ||
996 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000997 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 if (ctx->pattern[1] == SRE_OP_IN &&
999 (ctx->ptr >= end ||
1000 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001001 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001002 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 if (ret) {
1005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_ON_ERROR(ret);
1008 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001009 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 if (ctx->u.rep)
1011 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001012 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001013 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001014 if (ctx->u.rep)
1015 MARK_POP_DISCARD(ctx->lastmark);
1016 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001017
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018 case SRE_OP_REPEAT_ONE:
1019 /* match repeated sequence (maximizing regexp) */
1020
1021 /* this operator only works if the repeated item is
1022 exactly one character wide, and we're not already
1023 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001024 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001025
1026 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1027
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001028 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1029 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001030
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 if (ctx->ptr + ctx->pattern[1] > end)
1032 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001036 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1037 RETURN_ON_ERROR(ret);
1038 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1039 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001041
1042 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 string. check if the rest of the pattern matches,
1045 and backtrack if not. */
1046
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001047 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001049
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001050 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001051 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 state->ptr = ctx->ptr;
1053 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001054 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001056 LASTMARK_SAVE();
1057
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001059 /* tail starts with a literal. skip positions where
1060 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 for (;;) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001063 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1065 ctx->ptr--;
1066 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001067 }
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001068 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001069 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001070 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001071 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1072 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 if (ret) {
1074 RETURN_ON_ERROR(ret);
1075 RETURN_SUCCESS;
1076 }
Tim Peters3d563502006-01-21 02:47:53 +00001077
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001078 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001079
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 ctx->ptr--;
1081 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001082 }
1083
1084 } else {
1085 /* general case */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001086 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001087 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001088 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1089 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 if (ret) {
1091 RETURN_ON_ERROR(ret);
1092 RETURN_SUCCESS;
1093 }
1094 ctx->ptr--;
1095 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001096 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001097 }
1098 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001099 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001100
Guido van Rossum41c99e72003-04-14 17:59:34 +00001101 case SRE_OP_MIN_REPEAT_ONE:
1102 /* match repeated sequence (minimizing regexp) */
1103
1104 /* this operator only works if the repeated item is
1105 exactly one character wide, and we're not already
1106 collecting backtracking points. for other cases,
1107 use the MIN_REPEAT operator */
1108
1109 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1110
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001111 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1112 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001113
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001114 if (ctx->ptr + ctx->pattern[1] > end)
1115 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001116
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001118
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 if (ctx->pattern[1] == 0)
1120 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001121 else {
1122 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1124 RETURN_ON_ERROR(ret);
1125 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001126 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001127 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001128 RETURN_FAILURE;
1129 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001130 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001131 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001132 }
1133
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001134 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001135 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 state->ptr = ctx->ptr;
1137 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001138
1139 } else {
1140 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001141 LASTMARK_SAVE();
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001142 while ((Py_ssize_t)ctx->pattern[2] == 65535
1143 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001145 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1146 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001147 if (ret) {
1148 RETURN_ON_ERROR(ret);
1149 RETURN_SUCCESS;
1150 }
1151 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001152 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001153 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001154 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001156 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001157 assert(ret == 1);
1158 ctx->ptr++;
1159 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001160 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001161 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001162 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001163 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001164
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001165 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001166 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001167 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001168 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001169 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1170 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001171
1172 /* install new repeat context */
Jack Diederich2d400772006-05-27 15:44:34 +00001173 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001174 if (!ctx->u.rep) {
1175 PyErr_NoMemory();
Neal Norwitzef0de022006-08-12 01:53:28 +00001176 RETURN_FAILURE;
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001177 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 ctx->u.rep->count = -1;
1179 ctx->u.rep->pattern = ctx->pattern;
1180 ctx->u.rep->prev = state->repeat;
1181 ctx->u.rep->last_ptr = NULL;
1182 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001183
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001186 state->repeat = ctx->u.rep->prev;
Jack Diederich2d400772006-05-27 15:44:34 +00001187 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001188
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 if (ret) {
1190 RETURN_ON_ERROR(ret);
1191 RETURN_SUCCESS;
1192 }
1193 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
1195 case SRE_OP_MAX_UNTIL:
1196 /* maximizing repeat */
1197 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1198
1199 /* FIXME: we probably need to deal with zero-width
1200 matches in here... */
1201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 ctx->u.rep = state->repeat;
1203 if (!ctx->u.rep)
1204 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001209
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001210 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1211 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001212
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001213 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001214 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001216 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1217 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 if (ret) {
1219 RETURN_ON_ERROR(ret);
1220 RETURN_SUCCESS;
1221 }
1222 ctx->u.rep->count = ctx->count-1;
1223 state->ptr = ctx->ptr;
1224 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001225 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 if ((ctx->count < ctx->u.rep->pattern[2] ||
1228 ctx->u.rep->pattern[2] == 65535) &&
1229 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001230 /* we may have enough matches, but if we can
1231 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001232 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001233 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001234 MARK_PUSH(ctx->lastmark);
1235 /* zero-width match protection */
1236 DATA_PUSH(&ctx->u.rep->last_ptr);
1237 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001238 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1239 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001240 DATA_POP(&ctx->u.rep->last_ptr);
1241 if (ret) {
1242 MARK_POP_DISCARD(ctx->lastmark);
1243 RETURN_ON_ERROR(ret);
1244 RETURN_SUCCESS;
1245 }
1246 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001247 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->u.rep->count = ctx->count-1;
1249 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001250 }
1251
1252 /* cannot match more repeated items here. make sure the
1253 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001255 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 RETURN_ON_SUCCESS(ret);
1257 state->repeat = ctx->u.rep;
1258 state->ptr = ctx->ptr;
1259 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001260
1261 case SRE_OP_MIN_UNTIL:
1262 /* minimizing repeat */
1263 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 ctx->u.rep = state->repeat;
1266 if (!ctx->u.rep)
1267 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001270
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001272
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001273 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1274 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001275
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001276 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001277 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001278 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001279 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1280 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 if (ret) {
1282 RETURN_ON_ERROR(ret);
1283 RETURN_SUCCESS;
1284 }
1285 ctx->u.rep->count = ctx->count-1;
1286 state->ptr = ctx->ptr;
1287 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001288 }
1289
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001290 LASTMARK_SAVE();
1291
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001292 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001295 if (ret) {
1296 RETURN_ON_ERROR(ret);
1297 RETURN_SUCCESS;
1298 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001299
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001300 state->repeat = ctx->u.rep;
1301 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001302
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001303 LASTMARK_RESTORE();
1304
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001305 if (ctx->count >= ctx->u.rep->pattern[2]
1306 && ctx->u.rep->pattern[2] != 65535)
1307 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001308
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001309 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001310 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1311 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001312 if (ret) {
1313 RETURN_ON_ERROR(ret);
1314 RETURN_SUCCESS;
1315 }
1316 ctx->u.rep->count = ctx->count-1;
1317 state->ptr = ctx->ptr;
1318 RETURN_FAILURE;
1319
1320 case SRE_OP_GROUPREF:
1321 /* match backreference */
1322 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1323 ctx->ptr, ctx->pattern[0]));
1324 i = ctx->pattern[0];
1325 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001326 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001327 if (groupref >= state->lastmark) {
1328 RETURN_FAILURE;
1329 } else {
1330 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1331 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1332 if (!p || !e || e < p)
1333 RETURN_FAILURE;
1334 while (p < e) {
1335 if (ctx->ptr >= end || *ctx->ptr != *p)
1336 RETURN_FAILURE;
1337 p++; ctx->ptr++;
1338 }
1339 }
1340 }
1341 ctx->pattern++;
1342 break;
1343
1344 case SRE_OP_GROUPREF_IGNORE:
1345 /* match backreference */
1346 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1347 ctx->ptr, ctx->pattern[0]));
1348 i = ctx->pattern[0];
1349 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001350 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001351 if (groupref >= state->lastmark) {
1352 RETURN_FAILURE;
1353 } else {
1354 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1355 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1356 if (!p || !e || e < p)
1357 RETURN_FAILURE;
1358 while (p < e) {
1359 if (ctx->ptr >= end ||
1360 state->lower(*ctx->ptr) != state->lower(*p))
1361 RETURN_FAILURE;
1362 p++; ctx->ptr++;
1363 }
1364 }
1365 }
1366 ctx->pattern++;
1367 break;
1368
1369 case SRE_OP_GROUPREF_EXISTS:
1370 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1371 ctx->ptr, ctx->pattern[0]));
1372 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1373 i = ctx->pattern[0];
1374 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001375 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001376 if (groupref >= state->lastmark) {
1377 ctx->pattern += ctx->pattern[1];
1378 break;
1379 } else {
1380 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1381 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1382 if (!p || !e || e < p) {
1383 ctx->pattern += ctx->pattern[1];
1384 break;
1385 }
1386 }
1387 }
1388 ctx->pattern += 2;
1389 break;
1390
1391 case SRE_OP_ASSERT:
1392 /* assert subpattern */
1393 /* <ASSERT> <skip> <back> <pattern> */
1394 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1395 ctx->ptr, ctx->pattern[1]));
1396 state->ptr = ctx->ptr - ctx->pattern[1];
1397 if (state->ptr < state->beginning)
1398 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001399 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001400 RETURN_ON_FAILURE(ret);
1401 ctx->pattern += ctx->pattern[0];
1402 break;
1403
1404 case SRE_OP_ASSERT_NOT:
1405 /* assert not subpattern */
1406 /* <ASSERT_NOT> <skip> <back> <pattern> */
1407 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1408 ctx->ptr, ctx->pattern[1]));
1409 state->ptr = ctx->ptr - ctx->pattern[1];
1410 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001411 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001412 if (ret) {
1413 RETURN_ON_ERROR(ret);
1414 RETURN_FAILURE;
1415 }
1416 }
1417 ctx->pattern += ctx->pattern[0];
1418 break;
1419
1420 case SRE_OP_FAILURE:
1421 /* immediate failure */
1422 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1423 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001424
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001425 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001426 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1427 ctx->pattern[-1]));
1428 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001429 }
1430 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001431
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001432exit:
1433 ctx_pos = ctx->last_ctx_pos;
1434 jump = ctx->jump;
1435 DATA_POP_DISCARD(ctx);
1436 if (ctx_pos == -1)
1437 return ret;
1438 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1439
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001440 switch (jump) {
1441 case JUMP_MAX_UNTIL_2:
1442 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1443 goto jump_max_until_2;
1444 case JUMP_MAX_UNTIL_3:
1445 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1446 goto jump_max_until_3;
1447 case JUMP_MIN_UNTIL_2:
1448 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1449 goto jump_min_until_2;
1450 case JUMP_MIN_UNTIL_3:
1451 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1452 goto jump_min_until_3;
1453 case JUMP_BRANCH:
1454 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1455 goto jump_branch;
1456 case JUMP_MAX_UNTIL_1:
1457 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1458 goto jump_max_until_1;
1459 case JUMP_MIN_UNTIL_1:
1460 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1461 goto jump_min_until_1;
1462 case JUMP_REPEAT:
1463 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1464 goto jump_repeat;
1465 case JUMP_REPEAT_ONE_1:
1466 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1467 goto jump_repeat_one_1;
1468 case JUMP_REPEAT_ONE_2:
1469 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1470 goto jump_repeat_one_2;
1471 case JUMP_MIN_REPEAT_ONE:
1472 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1473 goto jump_min_repeat_one;
1474 case JUMP_ASSERT:
1475 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1476 goto jump_assert;
1477 case JUMP_ASSERT_NOT:
1478 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1479 goto jump_assert_not;
1480 case JUMP_NONE:
1481 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1482 break;
1483 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001484
1485 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001486}
1487
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001488LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001489SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1490{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001491 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1492 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001493 Py_ssize_t status = 0;
1494 Py_ssize_t prefix_len = 0;
1495 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001496 SRE_CODE* prefix = NULL;
1497 SRE_CODE* charset = NULL;
1498 SRE_CODE* overlap = NULL;
1499 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001500
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001501 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001502 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001503 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001504
1505 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001506
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001507 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001508 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001509 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001510 end -= pattern[3]-1;
1511 if (end <= ptr)
1512 end = ptr+1;
1513 }
1514
Fredrik Lundh3562f112000-07-02 12:00:07 +00001515 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001516 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001517 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001518 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001519 prefix_skip = pattern[6];
1520 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001521 overlap = prefix + prefix_len - 1;
1522 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001523 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001524 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001525 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001526
1527 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001528 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001529
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001530 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1531 TRACE(("charset = %p\n", charset));
1532
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001533#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001534 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001535 /* pattern starts with a known prefix. use the overlap
1536 table to skip forward as fast as we possibly can */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001537 Py_ssize_t i = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001538 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001539 while (ptr < end) {
1540 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001541 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001542 if (!i)
1543 break;
1544 else
1545 i = overlap[i];
1546 } else {
1547 if (++i == prefix_len) {
1548 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001549 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1550 state->start = ptr + 1 - prefix_len;
1551 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001552 if (flags & SRE_INFO_LITERAL)
1553 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001554 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001555 if (status != 0)
1556 return status;
1557 /* close but no cigar -- try again */
1558 i = overlap[i];
1559 }
1560 break;
1561 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001562 }
1563 ptr++;
1564 }
1565 return 0;
1566 }
1567#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001568
Fredrik Lundh3562f112000-07-02 12:00:07 +00001569 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001571 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001572 SRE_CODE chr = pattern[1];
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001573 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001574 for (;;) {
1575 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1576 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001577 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001579 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 state->start = ptr;
1581 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001582 if (flags & SRE_INFO_LITERAL)
1583 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001584 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 if (status != 0)
1586 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001587 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001588 } else if (charset) {
1589 /* pattern starts with a character from a known set */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001590 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001592 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001594 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001595 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001596 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001597 state->start = ptr;
1598 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001599 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001600 if (status != 0)
1601 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001602 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001603 }
1604 } else
1605 /* general case */
1606 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001607 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001608 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001609 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001610 if (status != 0)
1611 break;
1612 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001613
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001614 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001615}
Tim Peters3d563502006-01-21 02:47:53 +00001616
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001617LOCAL(int)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001618SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001619{
1620 /* check if given string is a literal template (i.e. no escapes) */
1621 while (len-- > 0)
1622 if (*ptr++ == '\\')
1623 return 0;
1624 return 1;
1625}
Guido van Rossumb700df92000-03-31 14:59:30 +00001626
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001627#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001628
1629/* -------------------------------------------------------------------- */
1630/* factories and destructors */
1631
1632/* see sre.h for object declarations */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001633static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1634static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001635
1636static PyObject *
Georg Brandl964f5972006-05-28 22:38:57 +00001637sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001638{
Benjamin Peterson9dccb012013-01-10 10:37:47 -06001639 return PyInt_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001640}
1641
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001642static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001643sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001644{
1645 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001646 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001647 return NULL;
1648 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001649 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001650 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001651#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001652 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001653#else
1654 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001655#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001656 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001657}
1658
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001659LOCAL(void)
1660state_reset(SRE_STATE* state)
1661{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001662 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001663 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001665 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001666 state->lastindex = -1;
1667
1668 state->repeat = NULL;
1669
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001670 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001671}
1672
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001673static void*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001674getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001675{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001676 /* given a python object, return a data pointer, a length (in
1677 characters), and a character size. return NULL if the object
1678 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001679
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001680 PyBufferProcs *buffer;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001681 Py_ssize_t size, bytes;
1682 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001683 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001684
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001685#if defined(HAVE_UNICODE)
1686 if (PyUnicode_Check(string)) {
1687 /* unicode strings doesn't always support the buffer interface */
1688 ptr = (void*) PyUnicode_AS_DATA(string);
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00001689 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001690 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001691 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001692
1693 } else {
1694#endif
1695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001696 /* get pointer to string buffer */
Christian Heimese93237d2007-12-19 02:37:44 +00001697 buffer = Py_TYPE(string)->tp_as_buffer;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001698 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1699 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001700 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 return NULL;
1702 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001703
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001704 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001705 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1706 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001707 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1708 return NULL;
1709 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001710
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001711 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001712#if PY_VERSION_HEX >= 0x01060000
1713 size = PyObject_Size(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001714#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001715 size = PyObject_Length(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001716#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001717
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001718 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001719 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001720#if defined(HAVE_UNICODE)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001721 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001722 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001723#endif
1724 else {
1725 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1726 return NULL;
1727 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001728
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001729#if defined(HAVE_UNICODE)
1730 }
1731#endif
1732
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001733 *p_length = size;
1734 *p_charsize = charsize;
1735
1736 return ptr;
1737}
1738
1739LOCAL(PyObject*)
1740state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001741 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001742{
1743 /* prepare state object */
1744
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001745 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001746 int charsize;
1747 void* ptr;
1748
1749 memset(state, 0, sizeof(SRE_STATE));
1750
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001751 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001752 state->lastindex = -1;
1753
1754 ptr = getstring(string, &length, &charsize);
1755 if (!ptr)
1756 return NULL;
1757
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001758 /* adjust boundaries */
1759 if (start < 0)
1760 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001761 else if (start > length)
1762 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001764 if (end < 0)
1765 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 else if (end > length)
1767 end = length;
1768
1769 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001770
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001771 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001772
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001773 state->start = (void*) ((char*) ptr + start * state->charsize);
1774 state->end = (void*) ((char*) ptr + end * state->charsize);
1775
1776 Py_INCREF(string);
1777 state->string = string;
1778 state->pos = start;
1779 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001780
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001781 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001782 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001783 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001784#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001785 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001786#else
1787 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001788#endif
1789 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001790 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001791
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001792 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001793}
1794
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001795LOCAL(void)
1796state_fini(SRE_STATE* state)
1797{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001798 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001799 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001800}
1801
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001802/* calculate offset from start of string */
1803#define STATE_OFFSET(state, member)\
1804 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1805
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001807state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001808{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001809 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001810
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001811 index = (index - 1) * 2;
1812
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001813 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001814 if (empty)
1815 /* want empty string */
1816 i = j = 0;
1817 else {
1818 Py_INCREF(Py_None);
1819 return Py_None;
1820 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001821 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001822 i = STATE_OFFSET(state, state->mark[index]);
1823 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001824 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001825
Fredrik Lundh58100642000-08-09 09:14:35 +00001826 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001827}
1828
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001829static void
1830pattern_error(int status)
1831{
1832 switch (status) {
1833 case SRE_ERROR_RECURSION_LIMIT:
1834 PyErr_SetString(
1835 PyExc_RuntimeError,
1836 "maximum recursion limit exceeded"
1837 );
1838 break;
1839 case SRE_ERROR_MEMORY:
1840 PyErr_NoMemory();
1841 break;
Facundo Batista4473d222008-01-08 21:10:12 +00001842 case SRE_ERROR_INTERRUPTED:
1843 /* An exception has already been raised, so let it fly */
1844 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001845 default:
1846 /* other error codes indicate compiler/engine bugs */
1847 PyErr_SetString(
1848 PyExc_RuntimeError,
1849 "internal error in regular expression engine"
1850 );
1851 }
1852}
1853
Guido van Rossumb700df92000-03-31 14:59:30 +00001854static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001855pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001856{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001857 if (self->weakreflist != NULL)
1858 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001859 Py_XDECREF(self->pattern);
1860 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001861 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001862 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001863}
1864
1865static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001866pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001867{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001868 SRE_STATE state;
1869 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001870
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001871 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001872 Py_ssize_t start = 0;
1873 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001874 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001875 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001876 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001877 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001878
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001879 string = state_init(&state, self, string, start, end);
1880 if (!string)
1881 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 state.ptr = state.start;
1884
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001885 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1886
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001887 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001888 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001890#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001891 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001892#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001893 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001894
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001895 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001896 if (PyErr_Occurred())
1897 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001898
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001900
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001901 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001902}
1903
1904static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001905pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001906{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001907 SRE_STATE state;
1908 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001909
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001910 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001911 Py_ssize_t start = 0;
1912 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001913 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001914 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001915 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001918 string = state_init(&state, self, string, start, end);
1919 if (!string)
1920 return NULL;
1921
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001922 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1923
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001924 if (state.charsize == 1) {
1925 status = sre_search(&state, PatternObject_GetCode(self));
1926 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001927#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001928 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001929#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001931
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001932 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1933
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001934 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001935
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001936 if (PyErr_Occurred())
1937 return NULL;
1938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001939 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001940}
1941
1942static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001943call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001944{
1945 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001946 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001947 PyObject* func;
1948 PyObject* result;
1949
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001950 if (!args)
1951 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001952 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001953 if (!name)
1954 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001955 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001956 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001957 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001958 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001959 func = PyObject_GetAttrString(mod, function);
1960 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961 if (!func)
1962 return NULL;
1963 result = PyObject_CallObject(func, args);
1964 Py_DECREF(func);
1965 Py_DECREF(args);
1966 return result;
1967}
1968
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001969#ifdef USE_BUILTIN_COPY
1970static int
1971deepcopy(PyObject** object, PyObject* memo)
1972{
1973 PyObject* copy;
1974
1975 copy = call(
1976 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001977 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001978 );
1979 if (!copy)
1980 return 0;
1981
1982 Py_DECREF(*object);
1983 *object = copy;
1984
1985 return 1; /* success */
1986}
1987#endif
1988
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001989static PyObject*
Guido van Rossum1ff91d92007-09-10 22:02:25 +00001990join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001991{
1992 /* join list elements */
1993
1994 PyObject* joiner;
1995#if PY_VERSION_HEX >= 0x01060000
1996 PyObject* function;
1997 PyObject* args;
1998#endif
1999 PyObject* result;
2000
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002001 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002002 if (!joiner)
2003 return NULL;
2004
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002005 if (PyList_GET_SIZE(list) == 0) {
2006 Py_DECREF(list);
2007 return joiner;
2008 }
2009
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002010#if PY_VERSION_HEX >= 0x01060000
2011 function = PyObject_GetAttrString(joiner, "join");
2012 if (!function) {
2013 Py_DECREF(joiner);
2014 return NULL;
2015 }
2016 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002017 if (!args) {
2018 Py_DECREF(function);
2019 Py_DECREF(joiner);
2020 return NULL;
2021 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002022 PyTuple_SET_ITEM(args, 0, list);
2023 result = PyObject_CallObject(function, args);
2024 Py_DECREF(args); /* also removes list */
2025 Py_DECREF(function);
2026#else
2027 result = call(
2028 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002029 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002030 );
2031#endif
2032 Py_DECREF(joiner);
2033
2034 return result;
2035}
2036
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002037static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002038pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002039{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002040 SRE_STATE state;
2041 PyObject* list;
2042 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002043 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002044
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002045 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002046 Py_ssize_t start = 0;
2047 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002048 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002049 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002050 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002052
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002053 string = state_init(&state, self, string, start, end);
2054 if (!string)
2055 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002056
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002058 if (!list) {
2059 state_fini(&state);
2060 return NULL;
2061 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002064
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002065 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002066
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002067 state_reset(&state);
2068
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002069 state.ptr = state.start;
2070
2071 if (state.charsize == 1) {
2072 status = sre_search(&state, PatternObject_GetCode(self));
2073 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002074#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002075 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002076#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002077 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002078
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002079 if (PyErr_Occurred())
2080 goto error;
2081
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002082 if (status <= 0) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002083 if (status == 0)
2084 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002085 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002087 }
Tim Peters3d563502006-01-21 02:47:53 +00002088
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002089 /* don't bother to build a match object */
2090 switch (self->groups) {
2091 case 0:
2092 b = STATE_OFFSET(&state, state.start);
2093 e = STATE_OFFSET(&state, state.ptr);
2094 item = PySequence_GetSlice(string, b, e);
2095 if (!item)
2096 goto error;
2097 break;
2098 case 1:
2099 item = state_getslice(&state, 1, string, 1);
2100 if (!item)
2101 goto error;
2102 break;
2103 default:
2104 item = PyTuple_New(self->groups);
2105 if (!item)
2106 goto error;
2107 for (i = 0; i < self->groups; i++) {
2108 PyObject* o = state_getslice(&state, i+1, string, 1);
2109 if (!o) {
2110 Py_DECREF(item);
2111 goto error;
2112 }
2113 PyTuple_SET_ITEM(item, i, o);
2114 }
2115 break;
2116 }
2117
2118 status = PyList_Append(list, item);
2119 Py_DECREF(item);
2120 if (status < 0)
2121 goto error;
2122
2123 if (state.ptr == state.start)
2124 state.start = (void*) ((char*) state.ptr + state.charsize);
2125 else
2126 state.start = state.ptr;
2127
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002129
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002130 state_fini(&state);
2131 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002132
2133error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002134 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002135 state_fini(&state);
2136 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002137
Guido van Rossumb700df92000-03-31 14:59:30 +00002138}
2139
Fredrik Lundh703ce812001-10-24 22:16:30 +00002140#if PY_VERSION_HEX >= 0x02020000
2141static PyObject*
2142pattern_finditer(PatternObject* pattern, PyObject* args)
2143{
2144 PyObject* scanner;
2145 PyObject* search;
2146 PyObject* iterator;
2147
2148 scanner = pattern_scanner(pattern, args);
2149 if (!scanner)
2150 return NULL;
2151
2152 search = PyObject_GetAttrString(scanner, "search");
2153 Py_DECREF(scanner);
2154 if (!search)
2155 return NULL;
2156
2157 iterator = PyCallIter_New(search, Py_None);
2158 Py_DECREF(search);
2159
2160 return iterator;
2161}
2162#endif
2163
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002164static PyObject*
2165pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2166{
2167 SRE_STATE state;
2168 PyObject* list;
2169 PyObject* item;
2170 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002171 Py_ssize_t n;
2172 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002173 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002174
2175 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002176 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002177 static char* kwlist[] = { "source", "maxsplit", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002178 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002179 &string, &maxsplit))
2180 return NULL;
2181
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002182 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002183 if (!string)
2184 return NULL;
2185
2186 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002187 if (!list) {
2188 state_fini(&state);
2189 return NULL;
2190 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002191
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002192 n = 0;
2193 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002194
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002195 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002196
2197 state_reset(&state);
2198
2199 state.ptr = state.start;
2200
2201 if (state.charsize == 1) {
2202 status = sre_search(&state, PatternObject_GetCode(self));
2203 } else {
2204#if defined(HAVE_UNICODE)
2205 status = sre_usearch(&state, PatternObject_GetCode(self));
2206#endif
2207 }
2208
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002209 if (PyErr_Occurred())
2210 goto error;
2211
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002212 if (status <= 0) {
2213 if (status == 0)
2214 break;
2215 pattern_error(status);
2216 goto error;
2217 }
Tim Peters3d563502006-01-21 02:47:53 +00002218
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002219 if (state.start == state.ptr) {
2220 if (last == state.end)
2221 break;
2222 /* skip one character */
2223 state.start = (void*) ((char*) state.ptr + state.charsize);
2224 continue;
2225 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002226
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002227 /* get segment before this match */
2228 item = PySequence_GetSlice(
2229 string, STATE_OFFSET(&state, last),
2230 STATE_OFFSET(&state, state.start)
2231 );
2232 if (!item)
2233 goto error;
2234 status = PyList_Append(list, item);
2235 Py_DECREF(item);
2236 if (status < 0)
2237 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002238
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002239 /* add groups (if any) */
2240 for (i = 0; i < self->groups; i++) {
2241 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002242 if (!item)
2243 goto error;
2244 status = PyList_Append(list, item);
2245 Py_DECREF(item);
2246 if (status < 0)
2247 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002248 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002249
2250 n = n + 1;
2251
2252 last = state.start = state.ptr;
2253
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002254 }
2255
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002256 /* get segment following last match (even if empty) */
2257 item = PySequence_GetSlice(
2258 string, STATE_OFFSET(&state, last), state.endpos
2259 );
2260 if (!item)
2261 goto error;
2262 status = PyList_Append(list, item);
2263 Py_DECREF(item);
2264 if (status < 0)
2265 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002266
2267 state_fini(&state);
2268 return list;
2269
2270error:
2271 Py_DECREF(list);
2272 state_fini(&state);
2273 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002274
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002275}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002276
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002277static PyObject*
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002278pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002279 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002280{
2281 SRE_STATE state;
2282 PyObject* list;
2283 PyObject* item;
2284 PyObject* filter;
2285 PyObject* args;
2286 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002287 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002288 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002289 Py_ssize_t n;
2290 Py_ssize_t i, b, e;
2291 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002292 int filter_is_callable;
2293
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002294 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002295 /* sub/subn takes either a function or a template */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002296 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002297 Py_INCREF(filter);
2298 filter_is_callable = 1;
2299 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002300 /* if not callable, check if it's a literal string */
2301 int literal;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002302 ptr = getstring(ptemplate, &n, &bint);
2303 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002304 if (ptr) {
2305 if (b == 1) {
Skip Montanaro816a1622006-04-18 11:53:09 +00002306 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002307 } else {
2308#if defined(HAVE_UNICODE)
Skip Montanaro816a1622006-04-18 11:53:09 +00002309 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002310#endif
2311 }
2312 } else {
2313 PyErr_Clear();
2314 literal = 0;
2315 }
2316 if (literal) {
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002317 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002318 Py_INCREF(filter);
2319 filter_is_callable = 0;
2320 } else {
2321 /* not a literal; hand it over to the template compiler */
2322 filter = call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002323 SRE_PY_MODULE, "_subx",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002324 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002325 );
2326 if (!filter)
2327 return NULL;
2328 filter_is_callable = PyCallable_Check(filter);
2329 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002330 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002331
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002332 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002333 if (!string) {
2334 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002335 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002336 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002337
2338 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002339 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002340 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002341 state_fini(&state);
2342 return NULL;
2343 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002344
2345 n = i = 0;
2346
2347 while (!count || n < count) {
2348
2349 state_reset(&state);
2350
2351 state.ptr = state.start;
2352
2353 if (state.charsize == 1) {
2354 status = sre_search(&state, PatternObject_GetCode(self));
2355 } else {
2356#if defined(HAVE_UNICODE)
2357 status = sre_usearch(&state, PatternObject_GetCode(self));
2358#endif
2359 }
2360
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002361 if (PyErr_Occurred())
2362 goto error;
2363
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002364 if (status <= 0) {
2365 if (status == 0)
2366 break;
2367 pattern_error(status);
2368 goto error;
2369 }
Tim Peters3d563502006-01-21 02:47:53 +00002370
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002371 b = STATE_OFFSET(&state, state.start);
2372 e = STATE_OFFSET(&state, state.ptr);
2373
2374 if (i < b) {
2375 /* get segment before this match */
2376 item = PySequence_GetSlice(string, i, b);
2377 if (!item)
2378 goto error;
2379 status = PyList_Append(list, item);
2380 Py_DECREF(item);
2381 if (status < 0)
2382 goto error;
2383
2384 } else if (i == b && i == e && n > 0)
2385 /* ignore empty match on latest position */
2386 goto next;
2387
2388 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002389 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002390 match = pattern_new_match(self, &state, 1);
2391 if (!match)
2392 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002393 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002394 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002395 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002396 goto error;
2397 }
2398 item = PyObject_CallObject(filter, args);
2399 Py_DECREF(args);
2400 Py_DECREF(match);
2401 if (!item)
2402 goto error;
2403 } else {
2404 /* filter is literal string */
2405 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002406 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002407 }
2408
2409 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002410 if (item != Py_None) {
2411 status = PyList_Append(list, item);
2412 Py_DECREF(item);
2413 if (status < 0)
2414 goto error;
2415 }
Tim Peters3d563502006-01-21 02:47:53 +00002416
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002417 i = e;
2418 n = n + 1;
2419
2420next:
2421 /* move on */
2422 if (state.ptr == state.start)
2423 state.start = (void*) ((char*) state.ptr + state.charsize);
2424 else
2425 state.start = state.ptr;
2426
2427 }
2428
2429 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002430 if (i < state.endpos) {
2431 item = PySequence_GetSlice(string, i, state.endpos);
2432 if (!item)
2433 goto error;
2434 status = PyList_Append(list, item);
2435 Py_DECREF(item);
2436 if (status < 0)
2437 goto error;
2438 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002439
2440 state_fini(&state);
2441
Guido van Rossum4e173842001-12-07 04:25:10 +00002442 Py_DECREF(filter);
2443
Fredrik Lundhdac58492001-10-21 21:48:30 +00002444 /* convert list to single string (also removes list) */
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002445 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002446
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002447 if (!item)
2448 return NULL;
2449
2450 if (subn)
Antoine Pitroub83575b2012-12-02 12:52:36 +01002451 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002452
2453 return item;
2454
2455error:
2456 Py_DECREF(list);
2457 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002458 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002459 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002460
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002461}
2462
2463static PyObject*
2464pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2465{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002466 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002467 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002468 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002469 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002470 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002471 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002472 return NULL;
2473
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002474 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002475}
2476
2477static PyObject*
2478pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2479{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002480 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002481 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002482 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002483 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002484 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002485 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002486 return NULL;
2487
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002488 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002489}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002490
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002491static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00002492pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002493{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002494#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002495 PatternObject* copy;
2496 int offset;
2497
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002498 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2499 if (!copy)
2500 return NULL;
2501
2502 offset = offsetof(PatternObject, groups);
2503
2504 Py_XINCREF(self->groupindex);
2505 Py_XINCREF(self->indexgroup);
2506 Py_XINCREF(self->pattern);
2507
2508 memcpy((char*) copy + offset, (char*) self + offset,
2509 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002510 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002511
2512 return (PyObject*) copy;
2513#else
2514 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2515 return NULL;
2516#endif
2517}
2518
2519static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00002520pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002521{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002522#ifdef USE_BUILTIN_COPY
2523 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002524
Georg Brandlfbef5882006-05-28 22:14:04 +00002525 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002526 if (!copy)
2527 return NULL;
2528
2529 if (!deepcopy(&copy->groupindex, memo) ||
2530 !deepcopy(&copy->indexgroup, memo) ||
2531 !deepcopy(&copy->pattern, memo)) {
2532 Py_DECREF(copy);
2533 return NULL;
2534 }
2535
2536#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002537 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2538 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002539#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002540}
2541
Raymond Hettinger94478742004-09-24 04:31:19 +00002542PyDoc_STRVAR(pattern_match_doc,
2543"match(string[, pos[, endpos]]) --> match object or None.\n\
2544 Matches zero or more characters at the beginning of the string");
2545
2546PyDoc_STRVAR(pattern_search_doc,
2547"search(string[, pos[, endpos]]) --> match object or None.\n\
2548 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlovc08ded92012-12-25 18:50:03 +02002549 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002550
2551PyDoc_STRVAR(pattern_split_doc,
2552"split(string[, maxsplit = 0]) --> list.\n\
2553 Split string by the occurrences of pattern.");
2554
2555PyDoc_STRVAR(pattern_findall_doc,
2556"findall(string[, pos[, endpos]]) --> list.\n\
2557 Return a list of all non-overlapping matches of pattern in string.");
2558
2559PyDoc_STRVAR(pattern_finditer_doc,
2560"finditer(string[, pos[, endpos]]) --> iterator.\n\
2561 Return an iterator over all non-overlapping matches for the \n\
2562 RE pattern in string. For each match, the iterator returns a\n\
2563 match object.");
2564
2565PyDoc_STRVAR(pattern_sub_doc,
2566"sub(repl, string[, count = 0]) --> newstring\n\
2567 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002568 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002569
2570PyDoc_STRVAR(pattern_subn_doc,
2571"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2572 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2573 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002574 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002575
2576PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2577
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002578static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002579 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002580 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002581 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002582 pattern_search_doc},
2583 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2584 pattern_sub_doc},
2585 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2586 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002587 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002588 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002589 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002590 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002591#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002592 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2593 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002594#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002595 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00002596 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2597 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002598 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002599};
2600
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002601#define PAT_OFF(x) offsetof(PatternObject, x)
2602static PyMemberDef pattern_members[] = {
2603 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2604 {"flags", T_INT, PAT_OFF(flags), READONLY},
2605 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2606 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2607 {NULL} /* Sentinel */
2608};
Guido van Rossumb700df92000-03-31 14:59:30 +00002609
2610statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002611 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002612 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002613 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002614 (destructor)pattern_dealloc, /*tp_dealloc*/
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002615 0, /* tp_print */
2616 0, /* tp_getattrn */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002617 0, /* tp_setattr */
2618 0, /* tp_compare */
2619 0, /* tp_repr */
2620 0, /* tp_as_number */
2621 0, /* tp_as_sequence */
2622 0, /* tp_as_mapping */
2623 0, /* tp_hash */
2624 0, /* tp_call */
2625 0, /* tp_str */
2626 0, /* tp_getattro */
2627 0, /* tp_setattro */
2628 0, /* tp_as_buffer */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002629 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002630 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002631 0, /* tp_traverse */
2632 0, /* tp_clear */
2633 0, /* tp_richcompare */
2634 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002635 0, /* tp_iter */
2636 0, /* tp_iternext */
2637 pattern_methods, /* tp_methods */
2638 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002639};
2640
Guido van Rossum8b762f02008-08-05 03:39:21 +00002641static int _validate(PatternObject *self); /* Forward */
2642
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002643static PyObject *
2644_compile(PyObject* self_, PyObject* args)
2645{
2646 /* "compile" pattern descriptor to pattern object */
2647
2648 PatternObject* self;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002649 Py_ssize_t i, n;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002650
2651 PyObject* pattern;
2652 int flags = 0;
2653 PyObject* code;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002654 Py_ssize_t groups = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002655 PyObject* groupindex = NULL;
2656 PyObject* indexgroup = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002657 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002658 &PyList_Type, &code, &groups,
2659 &groupindex, &indexgroup))
2660 return NULL;
2661
2662 n = PyList_GET_SIZE(code);
Christian Heimes4956d2b2008-01-18 19:12:56 +00002663 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002664 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2665 if (!self)
2666 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002667 self->weakreflist = NULL;
2668 self->pattern = NULL;
2669 self->groupindex = NULL;
2670 self->indexgroup = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002671
2672 self->codesize = n;
2673
2674 for (i = 0; i < n; i++) {
2675 PyObject *o = PyList_GET_ITEM(code, i);
2676 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2677 : PyLong_AsUnsignedLong(o);
Antoine Pitroub83ea142012-11-20 22:30:42 +01002678 if (value == (unsigned long)-1 && PyErr_Occurred()) {
2679 if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2680 PyErr_SetString(PyExc_OverflowError,
2681 "regular expression code size limit exceeded");
2682 }
2683 break;
2684 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002685 self->code[i] = (SRE_CODE) value;
2686 if ((unsigned long) self->code[i] != value) {
2687 PyErr_SetString(PyExc_OverflowError,
2688 "regular expression code size limit exceeded");
2689 break;
2690 }
2691 }
2692
2693 if (PyErr_Occurred()) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002694 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002695 return NULL;
2696 }
2697
2698 Py_INCREF(pattern);
2699 self->pattern = pattern;
2700
2701 self->flags = flags;
2702
2703 self->groups = groups;
2704
2705 Py_XINCREF(groupindex);
2706 self->groupindex = groupindex;
2707
2708 Py_XINCREF(indexgroup);
2709 self->indexgroup = indexgroup;
2710
2711 self->weakreflist = NULL;
2712
Guido van Rossum8b762f02008-08-05 03:39:21 +00002713 if (!_validate(self)) {
2714 Py_DECREF(self);
2715 return NULL;
2716 }
2717
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002718 return (PyObject*) self;
2719}
2720
Guido van Rossumb700df92000-03-31 14:59:30 +00002721/* -------------------------------------------------------------------- */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002722/* Code validation */
2723
2724/* To learn more about this code, have a look at the _compile() function in
2725 Lib/sre_compile.py. The validation functions below checks the code array
2726 for conformance with the code patterns generated there.
2727
2728 The nice thing about the generated code is that it is position-independent:
2729 all jumps are relative jumps forward. Also, jumps don't cross each other:
2730 the target of a later jump is always earlier than the target of an earlier
2731 jump. IOW, this is okay:
2732
2733 J---------J-------T--------T
2734 \ \_____/ /
2735 \______________________/
2736
2737 but this is not:
2738
2739 J---------J-------T--------T
2740 \_________\_____/ /
2741 \____________/
2742
2743 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2744 bytes wide (the latter if Python is compiled for "wide" unicode support).
2745*/
2746
2747/* Defining this one enables tracing of the validator */
2748#undef VVERBOSE
2749
2750/* Trace macro for the validator */
2751#if defined(VVERBOSE)
2752#define VTRACE(v) printf v
2753#else
Senthil Kumarand5830682011-10-20 02:13:23 +08002754#define VTRACE(v) do {} while(0) /* do nothing */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002755#endif
2756
2757/* Report failure */
2758#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2759
2760/* Extract opcode, argument, or skip count from code array */
2761#define GET_OP \
2762 do { \
2763 VTRACE(("%p: ", code)); \
2764 if (code >= end) FAIL; \
2765 op = *code++; \
2766 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2767 } while (0)
2768#define GET_ARG \
2769 do { \
2770 VTRACE(("%p= ", code)); \
2771 if (code >= end) FAIL; \
2772 arg = *code++; \
2773 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2774 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002775#define GET_SKIP_ADJ(adj) \
Guido van Rossum8b762f02008-08-05 03:39:21 +00002776 do { \
2777 VTRACE(("%p= ", code)); \
2778 if (code >= end) FAIL; \
2779 skip = *code; \
2780 VTRACE(("%lu (skip to %p)\n", \
2781 (unsigned long)skip, code+skip)); \
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002782 if (code+skip-adj < code || code+skip-adj > end)\
Guido van Rossum8b762f02008-08-05 03:39:21 +00002783 FAIL; \
2784 code++; \
2785 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002786#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum8b762f02008-08-05 03:39:21 +00002787
2788static int
2789_validate_charset(SRE_CODE *code, SRE_CODE *end)
2790{
2791 /* Some variables are manipulated by the macros above */
2792 SRE_CODE op;
2793 SRE_CODE arg;
2794 SRE_CODE offset;
2795 int i;
2796
2797 while (code < end) {
2798 GET_OP;
2799 switch (op) {
2800
2801 case SRE_OP_NEGATE:
2802 break;
2803
2804 case SRE_OP_LITERAL:
2805 GET_ARG;
2806 break;
2807
2808 case SRE_OP_RANGE:
2809 GET_ARG;
2810 GET_ARG;
2811 break;
2812
2813 case SRE_OP_CHARSET:
2814 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2815 if (code+offset < code || code+offset > end)
2816 FAIL;
2817 code += offset;
2818 break;
2819
2820 case SRE_OP_BIGCHARSET:
2821 GET_ARG; /* Number of blocks */
2822 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2823 if (code+offset < code || code+offset > end)
2824 FAIL;
2825 /* Make sure that each byte points to a valid block */
2826 for (i = 0; i < 256; i++) {
2827 if (((unsigned char *)code)[i] >= arg)
2828 FAIL;
2829 }
2830 code += offset;
2831 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2832 if (code+offset < code || code+offset > end)
2833 FAIL;
2834 code += offset;
2835 break;
2836
2837 case SRE_OP_CATEGORY:
2838 GET_ARG;
2839 switch (arg) {
2840 case SRE_CATEGORY_DIGIT:
2841 case SRE_CATEGORY_NOT_DIGIT:
2842 case SRE_CATEGORY_SPACE:
2843 case SRE_CATEGORY_NOT_SPACE:
2844 case SRE_CATEGORY_WORD:
2845 case SRE_CATEGORY_NOT_WORD:
2846 case SRE_CATEGORY_LINEBREAK:
2847 case SRE_CATEGORY_NOT_LINEBREAK:
2848 case SRE_CATEGORY_LOC_WORD:
2849 case SRE_CATEGORY_LOC_NOT_WORD:
2850 case SRE_CATEGORY_UNI_DIGIT:
2851 case SRE_CATEGORY_UNI_NOT_DIGIT:
2852 case SRE_CATEGORY_UNI_SPACE:
2853 case SRE_CATEGORY_UNI_NOT_SPACE:
2854 case SRE_CATEGORY_UNI_WORD:
2855 case SRE_CATEGORY_UNI_NOT_WORD:
2856 case SRE_CATEGORY_UNI_LINEBREAK:
2857 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2858 break;
2859 default:
2860 FAIL;
2861 }
2862 break;
2863
2864 default:
2865 FAIL;
2866
2867 }
2868 }
2869
2870 return 1;
2871}
2872
2873static int
2874_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2875{
2876 /* Some variables are manipulated by the macros above */
2877 SRE_CODE op;
2878 SRE_CODE arg;
2879 SRE_CODE skip;
2880
2881 VTRACE(("code=%p, end=%p\n", code, end));
2882
2883 if (code > end)
2884 FAIL;
2885
2886 while (code < end) {
2887 GET_OP;
2888 switch (op) {
2889
2890 case SRE_OP_MARK:
2891 /* We don't check whether marks are properly nested; the
2892 sre_match() code is robust even if they don't, and the worst
2893 you can get is nonsensical match results. */
2894 GET_ARG;
2895 if (arg > 2*groups+1) {
2896 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2897 FAIL;
2898 }
2899 break;
2900
2901 case SRE_OP_LITERAL:
2902 case SRE_OP_NOT_LITERAL:
2903 case SRE_OP_LITERAL_IGNORE:
2904 case SRE_OP_NOT_LITERAL_IGNORE:
2905 GET_ARG;
2906 /* The arg is just a character, nothing to check */
2907 break;
2908
2909 case SRE_OP_SUCCESS:
2910 case SRE_OP_FAILURE:
2911 /* Nothing to check; these normally end the matching process */
2912 break;
2913
2914 case SRE_OP_AT:
2915 GET_ARG;
2916 switch (arg) {
2917 case SRE_AT_BEGINNING:
2918 case SRE_AT_BEGINNING_STRING:
2919 case SRE_AT_BEGINNING_LINE:
2920 case SRE_AT_END:
2921 case SRE_AT_END_LINE:
2922 case SRE_AT_END_STRING:
2923 case SRE_AT_BOUNDARY:
2924 case SRE_AT_NON_BOUNDARY:
2925 case SRE_AT_LOC_BOUNDARY:
2926 case SRE_AT_LOC_NON_BOUNDARY:
2927 case SRE_AT_UNI_BOUNDARY:
2928 case SRE_AT_UNI_NON_BOUNDARY:
2929 break;
2930 default:
2931 FAIL;
2932 }
2933 break;
2934
2935 case SRE_OP_ANY:
2936 case SRE_OP_ANY_ALL:
2937 /* These have no operands */
2938 break;
2939
2940 case SRE_OP_IN:
2941 case SRE_OP_IN_IGNORE:
2942 GET_SKIP;
2943 /* Stop 1 before the end; we check the FAILURE below */
2944 if (!_validate_charset(code, code+skip-2))
2945 FAIL;
2946 if (code[skip-2] != SRE_OP_FAILURE)
2947 FAIL;
2948 code += skip-1;
2949 break;
2950
2951 case SRE_OP_INFO:
2952 {
2953 /* A minimal info field is
2954 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2955 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2956 more follows. */
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002957 SRE_CODE flags, i;
Guido van Rossum8b762f02008-08-05 03:39:21 +00002958 SRE_CODE *newcode;
2959 GET_SKIP;
2960 newcode = code+skip-1;
2961 GET_ARG; flags = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002962 GET_ARG; /* min */
2963 GET_ARG; /* max */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002964 /* Check that only valid flags are present */
2965 if ((flags & ~(SRE_INFO_PREFIX |
2966 SRE_INFO_LITERAL |
2967 SRE_INFO_CHARSET)) != 0)
2968 FAIL;
2969 /* PREFIX and CHARSET are mutually exclusive */
2970 if ((flags & SRE_INFO_PREFIX) &&
2971 (flags & SRE_INFO_CHARSET))
2972 FAIL;
2973 /* LITERAL implies PREFIX */
2974 if ((flags & SRE_INFO_LITERAL) &&
2975 !(flags & SRE_INFO_PREFIX))
2976 FAIL;
2977 /* Validate the prefix */
2978 if (flags & SRE_INFO_PREFIX) {
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002979 SRE_CODE prefix_len;
Guido van Rossum8b762f02008-08-05 03:39:21 +00002980 GET_ARG; prefix_len = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002981 GET_ARG; /* prefix skip */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002982 /* Here comes the prefix string */
2983 if (code+prefix_len < code || code+prefix_len > newcode)
2984 FAIL;
2985 code += prefix_len;
2986 /* And here comes the overlap table */
2987 if (code+prefix_len < code || code+prefix_len > newcode)
2988 FAIL;
2989 /* Each overlap value should be < prefix_len */
2990 for (i = 0; i < prefix_len; i++) {
2991 if (code[i] >= prefix_len)
2992 FAIL;
2993 }
2994 code += prefix_len;
2995 }
2996 /* Validate the charset */
2997 if (flags & SRE_INFO_CHARSET) {
2998 if (!_validate_charset(code, newcode-1))
2999 FAIL;
3000 if (newcode[-1] != SRE_OP_FAILURE)
3001 FAIL;
3002 code = newcode;
3003 }
3004 else if (code != newcode) {
3005 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3006 FAIL;
3007 }
3008 }
3009 break;
3010
3011 case SRE_OP_BRANCH:
3012 {
3013 SRE_CODE *target = NULL;
3014 for (;;) {
3015 GET_SKIP;
3016 if (skip == 0)
3017 break;
3018 /* Stop 2 before the end; we check the JUMP below */
3019 if (!_validate_inner(code, code+skip-3, groups))
3020 FAIL;
3021 code += skip-3;
3022 /* Check that it ends with a JUMP, and that each JUMP
3023 has the same target */
3024 GET_OP;
3025 if (op != SRE_OP_JUMP)
3026 FAIL;
3027 GET_SKIP;
3028 if (target == NULL)
3029 target = code+skip-1;
3030 else if (code+skip-1 != target)
3031 FAIL;
3032 }
3033 }
3034 break;
3035
3036 case SRE_OP_REPEAT_ONE:
3037 case SRE_OP_MIN_REPEAT_ONE:
3038 {
3039 SRE_CODE min, max;
3040 GET_SKIP;
3041 GET_ARG; min = arg;
3042 GET_ARG; max = arg;
3043 if (min > max)
3044 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003045 if (max > 65535)
3046 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003047 if (!_validate_inner(code, code+skip-4, groups))
3048 FAIL;
3049 code += skip-4;
3050 GET_OP;
3051 if (op != SRE_OP_SUCCESS)
3052 FAIL;
3053 }
3054 break;
3055
3056 case SRE_OP_REPEAT:
3057 {
3058 SRE_CODE min, max;
3059 GET_SKIP;
3060 GET_ARG; min = arg;
3061 GET_ARG; max = arg;
3062 if (min > max)
3063 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003064 if (max > 65535)
3065 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003066 if (!_validate_inner(code, code+skip-3, groups))
3067 FAIL;
3068 code += skip-3;
3069 GET_OP;
3070 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3071 FAIL;
3072 }
3073 break;
3074
3075 case SRE_OP_GROUPREF:
3076 case SRE_OP_GROUPREF_IGNORE:
3077 GET_ARG;
3078 if (arg >= groups)
3079 FAIL;
3080 break;
3081
3082 case SRE_OP_GROUPREF_EXISTS:
3083 /* The regex syntax for this is: '(?(group)then|else)', where
3084 'group' is either an integer group number or a group name,
3085 'then' and 'else' are sub-regexes, and 'else' is optional. */
3086 GET_ARG;
3087 if (arg >= groups)
3088 FAIL;
Guido van Rossume3c4fd92008-09-10 14:27:00 +00003089 GET_SKIP_ADJ(1);
Guido van Rossum8b762f02008-08-05 03:39:21 +00003090 code--; /* The skip is relative to the first arg! */
3091 /* There are two possibilities here: if there is both a 'then'
3092 part and an 'else' part, the generated code looks like:
3093
3094 GROUPREF_EXISTS
3095 <group>
3096 <skipyes>
3097 ...then part...
3098 JUMP
3099 <skipno>
3100 (<skipyes> jumps here)
3101 ...else part...
3102 (<skipno> jumps here)
3103
3104 If there is only a 'then' part, it looks like:
3105
3106 GROUPREF_EXISTS
3107 <group>
3108 <skip>
3109 ...then part...
3110 (<skip> jumps here)
3111
3112 There is no direct way to decide which it is, and we don't want
3113 to allow arbitrary jumps anywhere in the code; so we just look
3114 for a JUMP opcode preceding our skip target.
3115 */
3116 if (skip >= 3 && code+skip-3 >= code &&
3117 code[skip-3] == SRE_OP_JUMP)
3118 {
3119 VTRACE(("both then and else parts present\n"));
3120 if (!_validate_inner(code+1, code+skip-3, groups))
3121 FAIL;
3122 code += skip-2; /* Position after JUMP, at <skipno> */
3123 GET_SKIP;
3124 if (!_validate_inner(code, code+skip-1, groups))
3125 FAIL;
3126 code += skip-1;
3127 }
3128 else {
3129 VTRACE(("only a then part present\n"));
3130 if (!_validate_inner(code+1, code+skip-1, groups))
3131 FAIL;
3132 code += skip-1;
3133 }
3134 break;
3135
3136 case SRE_OP_ASSERT:
3137 case SRE_OP_ASSERT_NOT:
3138 GET_SKIP;
3139 GET_ARG; /* 0 for lookahead, width for lookbehind */
3140 code--; /* Back up over arg to simplify math below */
3141 if (arg & 0x80000000)
3142 FAIL; /* Width too large */
3143 /* Stop 1 before the end; we check the SUCCESS below */
3144 if (!_validate_inner(code+1, code+skip-2, groups))
3145 FAIL;
3146 code += skip-2;
3147 GET_OP;
3148 if (op != SRE_OP_SUCCESS)
3149 FAIL;
3150 break;
3151
3152 default:
3153 FAIL;
3154
3155 }
3156 }
3157
3158 VTRACE(("okay\n"));
3159 return 1;
3160}
3161
3162static int
3163_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3164{
3165 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3166 FAIL;
3167 if (groups == 0) /* fix for simplejson */
3168 groups = 100; /* 100 groups should always be safe */
3169 return _validate_inner(code, end-1, groups);
3170}
3171
3172static int
3173_validate(PatternObject *self)
3174{
3175 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3176 {
3177 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3178 return 0;
3179 }
3180 else
3181 VTRACE(("Success!\n"));
3182 return 1;
3183}
3184
3185/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003186/* match methods */
3187
3188static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003189match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003190{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003191 Py_XDECREF(self->regs);
3192 Py_XDECREF(self->string);
3193 Py_DECREF(self->pattern);
3194 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003195}
3196
3197static PyObject*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003198match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003199{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003200 if (index < 0 || index >= self->groups) {
3201 /* raise IndexError if we were given a bad group number */
3202 PyErr_SetString(
3203 PyExc_IndexError,
3204 "no such group"
3205 );
3206 return NULL;
3207 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003208
Fredrik Lundh6f013982000-07-03 18:44:21 +00003209 index *= 2;
3210
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003211 if (self->string == Py_None || self->mark[index] < 0) {
3212 /* return default value if the string or group is undefined */
3213 Py_INCREF(def);
3214 return def;
3215 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003216
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003217 return PySequence_GetSlice(
3218 self->string, self->mark[index], self->mark[index+1]
3219 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003220}
3221
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003222static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003223match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003224{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003225 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003226
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003227 if (PyInt_Check(index))
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003228 return PyInt_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003229
Fredrik Lundh6f013982000-07-03 18:44:21 +00003230 i = -1;
3231
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003232 if (self->pattern->groupindex) {
3233 index = PyObject_GetItem(self->pattern->groupindex, index);
3234 if (index) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003235 if (PyInt_Check(index) || PyLong_Check(index))
3236 i = PyInt_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003237 Py_DECREF(index);
3238 } else
3239 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003240 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003241
3242 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003243}
3244
3245static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003246match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003247{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003248 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003249}
3250
3251static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003252match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003253{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003254 /* delegate to Python code */
3255 return call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00003256 SRE_PY_MODULE, "_expand",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003257 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003258 );
3259}
3260
3261static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003262match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003263{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003264 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003265 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003266
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003267 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003268
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003269 switch (size) {
3270 case 0:
3271 result = match_getslice(self, Py_False, Py_None);
3272 break;
3273 case 1:
3274 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3275 break;
3276 default:
3277 /* fetch multiple items */
3278 result = PyTuple_New(size);
3279 if (!result)
3280 return NULL;
3281 for (i = 0; i < size; i++) {
3282 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003283 self, PyTuple_GET_ITEM(args, i), Py_None
3284 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003285 if (!item) {
3286 Py_DECREF(result);
3287 return NULL;
3288 }
3289 PyTuple_SET_ITEM(result, i, item);
3290 }
3291 break;
3292 }
3293 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003294}
3295
3296static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003297match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003298{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003299 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003300 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003301
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003302 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003303 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003304 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003305 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003306
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003307 result = PyTuple_New(self->groups-1);
3308 if (!result)
3309 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003311 for (index = 1; index < self->groups; index++) {
3312 PyObject* item;
3313 item = match_getslice_by_index(self, index, def);
3314 if (!item) {
3315 Py_DECREF(result);
3316 return NULL;
3317 }
3318 PyTuple_SET_ITEM(result, index-1, item);
3319 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003320
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003321 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003322}
3323
3324static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003325match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003326{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003327 PyObject* result;
3328 PyObject* keys;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003329 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003330
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003331 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003332 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003333 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003334 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003336 result = PyDict_New();
3337 if (!result || !self->pattern->groupindex)
3338 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003341 if (!keys)
3342 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003345 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003346 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003347 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003349 if (!key)
3350 goto failed;
3351 value = match_getslice(self, key, def);
3352 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003353 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003354 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003355 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003356 status = PyDict_SetItem(result, key, value);
3357 Py_DECREF(value);
3358 if (status < 0)
3359 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003360 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003361
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003362 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003363
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003364 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003365
3366failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003367 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003368 Py_DECREF(result);
3369 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003370}
3371
3372static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003373match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003374{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003375 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003376
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003377 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003378 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003379 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003380
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003381 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003382
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003383 if (index < 0 || index >= self->groups) {
3384 PyErr_SetString(
3385 PyExc_IndexError,
3386 "no such group"
3387 );
3388 return NULL;
3389 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003390
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003391 /* mark is -1 if group is undefined */
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003392 return PyInt_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003393}
3394
3395static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003396match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003397{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003398 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003399
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003400 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003401 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003402 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003403
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003404 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003405
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003406 if (index < 0 || index >= self->groups) {
3407 PyErr_SetString(
3408 PyExc_IndexError,
3409 "no such group"
3410 );
3411 return NULL;
3412 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003413
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003414 /* mark is -1 if group is undefined */
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003415 return PyInt_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003416}
3417
3418LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003419_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003420{
3421 PyObject* pair;
3422 PyObject* item;
3423
3424 pair = PyTuple_New(2);
3425 if (!pair)
3426 return NULL;
3427
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003428 item = PyInt_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003429 if (!item)
3430 goto error;
3431 PyTuple_SET_ITEM(pair, 0, item);
3432
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003433 item = PyInt_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003434 if (!item)
3435 goto error;
3436 PyTuple_SET_ITEM(pair, 1, item);
3437
3438 return pair;
3439
3440 error:
3441 Py_DECREF(pair);
3442 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003443}
3444
3445static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003446match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003447{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003448 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003449
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003450 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003451 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003452 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003453
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003454 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003455
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003456 if (index < 0 || index >= self->groups) {
3457 PyErr_SetString(
3458 PyExc_IndexError,
3459 "no such group"
3460 );
3461 return NULL;
3462 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003463
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003464 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003465 return _pair(self->mark[index*2], self->mark[index*2+1]);
3466}
3467
3468static PyObject*
3469match_regs(MatchObject* self)
3470{
3471 PyObject* regs;
3472 PyObject* item;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003473 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003474
3475 regs = PyTuple_New(self->groups);
3476 if (!regs)
3477 return NULL;
3478
3479 for (index = 0; index < self->groups; index++) {
3480 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3481 if (!item) {
3482 Py_DECREF(regs);
3483 return NULL;
3484 }
3485 PyTuple_SET_ITEM(regs, index, item);
3486 }
3487
3488 Py_INCREF(regs);
3489 self->regs = regs;
3490
3491 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003492}
3493
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003494static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003495match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003496{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003497#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003498 MatchObject* copy;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003499 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003500
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003501 slots = 2 * (self->pattern->groups+1);
3502
3503 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3504 if (!copy)
3505 return NULL;
3506
3507 /* this value a constant, but any compiler should be able to
3508 figure that out all by itself */
3509 offset = offsetof(MatchObject, string);
3510
3511 Py_XINCREF(self->pattern);
3512 Py_XINCREF(self->string);
3513 Py_XINCREF(self->regs);
3514
3515 memcpy((char*) copy + offset, (char*) self + offset,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003516 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003517
3518 return (PyObject*) copy;
3519#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003520 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003521 return NULL;
3522#endif
3523}
3524
3525static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003526match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003527{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003528#ifdef USE_BUILTIN_COPY
3529 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003530
Georg Brandlfbef5882006-05-28 22:14:04 +00003531 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003532 if (!copy)
3533 return NULL;
3534
3535 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3536 !deepcopy(&copy->string, memo) ||
3537 !deepcopy(&copy->regs, memo)) {
3538 Py_DECREF(copy);
3539 return NULL;
3540 }
3541
3542#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003543 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3544 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003545#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003546}
3547
Andrew Svetlov1c6c90f2012-12-23 20:09:01 +02003548PyDoc_STRVAR(match_doc,
3549"The result of re.match() and re.search().\n\
3550Match objects always have a boolean value of True.");
3551
3552PyDoc_STRVAR(match_group_doc,
3553"group([group1, ...]) -> str or tuple.\n\
3554 Return subgroup(s) of the match by indices or names.\n\
3555 For 0 returns the entire match.");
3556
3557PyDoc_STRVAR(match_start_doc,
3558"start([group=0]) -> int.\n\
3559 Return index of the start of the substring matched by group.");
3560
3561PyDoc_STRVAR(match_end_doc,
3562"end([group=0]) -> int.\n\
3563 Return index of the end of the substring matched by group.");
3564
3565PyDoc_STRVAR(match_span_doc,
3566"span([group]) -> tuple.\n\
3567 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3568
3569PyDoc_STRVAR(match_groups_doc,
3570"groups([default=None]) -> tuple.\n\
3571 Return a tuple containing all the subgroups of the match, from 1.\n\
3572 The default argument is used for groups\n\
3573 that did not participate in the match");
3574
3575PyDoc_STRVAR(match_groupdict_doc,
3576"groupdict([default=None]) -> dict.\n\
3577 Return a dictionary containing all the named subgroups of the match,\n\
3578 keyed by the subgroup name. The default argument is used for groups\n\
3579 that did not participate in the match");
3580
3581PyDoc_STRVAR(match_expand_doc,
3582"expand(template) -> str.\n\
3583 Return the string obtained by doing backslash substitution\n\
3584 on the string template, as done by the sub() method.");
3585
3586static PyMethodDef match_methods[] = {
3587 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3588 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3589 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3590 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3591 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3592 match_groups_doc},
3593 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3594 match_groupdict_doc},
3595 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Georg Brandlfbef5882006-05-28 22:14:04 +00003596 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3597 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003598 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003599};
3600
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003601static PyObject *
3602match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003603{
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003604 if (self->lastindex >= 0)
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003605 return PyInt_FromSsize_t(self->lastindex);
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003606 Py_INCREF(Py_None);
3607 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003608}
3609
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003610static PyObject *
3611match_lastgroup_get(MatchObject *self)
3612{
3613 if (self->pattern->indexgroup && self->lastindex >= 0) {
3614 PyObject* result = PySequence_GetItem(
3615 self->pattern->indexgroup, self->lastindex
3616 );
3617 if (result)
3618 return result;
3619 PyErr_Clear();
3620 }
3621 Py_INCREF(Py_None);
3622 return Py_None;
3623}
3624
3625static PyObject *
3626match_regs_get(MatchObject *self)
3627{
3628 if (self->regs) {
3629 Py_INCREF(self->regs);
3630 return self->regs;
3631 } else
3632 return match_regs(self);
3633}
3634
3635static PyGetSetDef match_getset[] = {
3636 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3637 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3638 {"regs", (getter)match_regs_get, (setter)NULL},
3639 {NULL}
3640};
3641
3642#define MATCH_OFF(x) offsetof(MatchObject, x)
3643static PyMemberDef match_members[] = {
3644 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3645 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3646 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3647 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3648 {NULL}
3649};
3650
3651
Guido van Rossumb700df92000-03-31 14:59:30 +00003652/* FIXME: implement setattr("string", None) as a special case (to
3653 detach the associated string, if any */
3654
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003655static PyTypeObject Match_Type = {
3656 PyVarObject_HEAD_INIT(NULL, 0)
3657 "_" SRE_MODULE ".SRE_Match",
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003658 sizeof(MatchObject), sizeof(Py_ssize_t),
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003659 (destructor)match_dealloc, /* tp_dealloc */
3660 0, /* tp_print */
3661 0, /* tp_getattr */
3662 0, /* tp_setattr */
3663 0, /* tp_compare */
3664 0, /* tp_repr */
3665 0, /* tp_as_number */
3666 0, /* tp_as_sequence */
3667 0, /* tp_as_mapping */
3668 0, /* tp_hash */
3669 0, /* tp_call */
3670 0, /* tp_str */
3671 0, /* tp_getattro */
3672 0, /* tp_setattro */
3673 0, /* tp_as_buffer */
3674 Py_TPFLAGS_DEFAULT,
Andrew Svetlov1c6c90f2012-12-23 20:09:01 +02003675 match_doc, /* tp_doc */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003676 0, /* tp_traverse */
3677 0, /* tp_clear */
3678 0, /* tp_richcompare */
3679 0, /* tp_weaklistoffset */
3680 0, /* tp_iter */
3681 0, /* tp_iternext */
3682 match_methods, /* tp_methods */
3683 match_members, /* tp_members */
3684 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003685};
3686
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003687static PyObject*
3688pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3689{
3690 /* create match object (from state object) */
3691
3692 MatchObject* match;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003693 Py_ssize_t i, j;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003694 char* base;
3695 int n;
3696
3697 if (status > 0) {
3698
3699 /* create match object (with room for extra group marks) */
Christian Heimes4956d2b2008-01-18 19:12:56 +00003700 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003701 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3702 2*(pattern->groups+1));
3703 if (!match)
3704 return NULL;
3705
3706 Py_INCREF(pattern);
3707 match->pattern = pattern;
3708
3709 Py_INCREF(state->string);
3710 match->string = state->string;
3711
3712 match->regs = NULL;
3713 match->groups = pattern->groups+1;
3714
3715 /* fill in group slices */
3716
3717 base = (char*) state->beginning;
3718 n = state->charsize;
3719
3720 match->mark[0] = ((char*) state->start - base) / n;
3721 match->mark[1] = ((char*) state->ptr - base) / n;
3722
3723 for (i = j = 0; i < pattern->groups; i++, j+=2)
3724 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3725 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3726 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3727 } else
3728 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3729
3730 match->pos = state->pos;
3731 match->endpos = state->endpos;
3732
3733 match->lastindex = state->lastindex;
3734
3735 return (PyObject*) match;
3736
3737 } else if (status == 0) {
3738
3739 /* no match */
3740 Py_INCREF(Py_None);
3741 return Py_None;
3742
3743 }
3744
3745 /* internal error */
3746 pattern_error(status);
3747 return NULL;
3748}
3749
3750
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003751/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003752/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003753
3754static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003755scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003756{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003757 state_fini(&self->state);
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003758 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003759 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003760}
3761
3762static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003763scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003764{
3765 SRE_STATE* state = &self->state;
3766 PyObject* match;
3767 int status;
3768
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003769 state_reset(state);
3770
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003771 state->ptr = state->start;
3772
3773 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003774 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003775 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003776#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003777 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003778#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003779 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003780 if (PyErr_Occurred())
3781 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003782
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003783 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003784 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003785
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003786 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003787 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003788 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003789 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003790
3791 return match;
3792}
3793
3794
3795static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003796scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003797{
3798 SRE_STATE* state = &self->state;
3799 PyObject* match;
3800 int status;
3801
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003802 state_reset(state);
3803
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003804 state->ptr = state->start;
3805
3806 if (state->charsize == 1) {
3807 status = sre_search(state, PatternObject_GetCode(self->pattern));
3808 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003809#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003810 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003811#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003812 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003813 if (PyErr_Occurred())
3814 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003815
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003816 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003817 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003818
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003819 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003820 state->start = (void*) ((char*) state->ptr + state->charsize);
3821 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003822 state->start = state->ptr;
3823
3824 return match;
3825}
3826
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003827static PyMethodDef scanner_methods[] = {
Georg Brandlfbef5882006-05-28 22:14:04 +00003828 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3829 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003830 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003831};
3832
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003833#define SCAN_OFF(x) offsetof(ScannerObject, x)
3834static PyMemberDef scanner_members[] = {
3835 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3836 {NULL} /* Sentinel */
3837};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003838
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003839statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003840 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003841 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003842 sizeof(ScannerObject), 0,
3843 (destructor)scanner_dealloc, /*tp_dealloc*/
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003844 0, /* tp_print */
3845 0, /* tp_getattr */
3846 0, /* tp_setattr */
3847 0, /* tp_reserved */
3848 0, /* tp_repr */
3849 0, /* tp_as_number */
3850 0, /* tp_as_sequence */
3851 0, /* tp_as_mapping */
3852 0, /* tp_hash */
3853 0, /* tp_call */
3854 0, /* tp_str */
3855 0, /* tp_getattro */
3856 0, /* tp_setattro */
3857 0, /* tp_as_buffer */
3858 Py_TPFLAGS_DEFAULT, /* tp_flags */
3859 0, /* tp_doc */
3860 0, /* tp_traverse */
3861 0, /* tp_clear */
3862 0, /* tp_richcompare */
3863 0, /* tp_weaklistoffset */
3864 0, /* tp_iter */
3865 0, /* tp_iternext */
3866 scanner_methods, /* tp_methods */
3867 scanner_members, /* tp_members */
3868 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003869};
3870
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003871static PyObject*
3872pattern_scanner(PatternObject* pattern, PyObject* args)
3873{
3874 /* create search state object */
3875
3876 ScannerObject* self;
3877
3878 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003879 Py_ssize_t start = 0;
3880 Py_ssize_t end = PY_SSIZE_T_MAX;
3881 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003882 return NULL;
3883
3884 /* create scanner object */
3885 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3886 if (!self)
3887 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003888 self->pattern = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003889
3890 string = state_init(&self->state, pattern, string, start, end);
3891 if (!string) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003892 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003893 return NULL;
3894 }
3895
3896 Py_INCREF(pattern);
3897 self->pattern = (PyObject*) pattern;
3898
3899 return (PyObject*) self;
3900}
3901
Guido van Rossumb700df92000-03-31 14:59:30 +00003902static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003903 {"compile", _compile, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00003904 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003905 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003906 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003907};
3908
Tim Peters3d563502006-01-21 02:47:53 +00003909#if PY_VERSION_HEX < 0x02030000
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003910DL_EXPORT(void) init_sre(void)
3911#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003912PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003913#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003914{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003915 PyObject* m;
3916 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003917 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003918
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003919 /* Patch object types */
Benjamin Petersone266d3e2010-04-06 03:34:09 +00003920 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3921 PyType_Ready(&Scanner_Type))
3922 return;
Guido van Rossumb700df92000-03-31 14:59:30 +00003923
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003924 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003925 if (m == NULL)
3926 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003927 d = PyModule_GetDict(m);
3928
Fredrik Lundh21009b92001-09-18 18:47:09 +00003929 x = PyInt_FromLong(SRE_MAGIC);
3930 if (x) {
3931 PyDict_SetItemString(d, "MAGIC", x);
3932 Py_DECREF(x);
3933 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003934
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003935 x = PyInt_FromLong(sizeof(SRE_CODE));
3936 if (x) {
3937 PyDict_SetItemString(d, "CODESIZE", x);
3938 Py_DECREF(x);
3939 }
3940
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003941 x = PyString_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003942 if (x) {
3943 PyDict_SetItemString(d, "copyright", x);
3944 Py_DECREF(x);
3945 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003946}
3947
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003948#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003949
3950/* vim:ts=4:sw=4:et
3951*/