blob: f3a29931a7a01fd3ea0ce090f71bf712bca90b29 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Neal Norwitza6d80fa2006-06-12 03:05:40 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d52000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000055
Neal Norwitz94a9c092006-03-16 06:30:02 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh971e78b2001-10-20 17:48:46 +000061#if PY_VERSION_HEX >= 0x01060000
62#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000063/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000064#define HAVE_UNICODE
65#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000066#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000068/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000069/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070
71/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000072#define USE_FAST_SEARCH
73
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000074/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000075#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000076
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000077/* enables copy/deepcopy handling (work in progress) */
78#undef USE_BUILTIN_COPY
79
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000080#if PY_VERSION_HEX < 0x01060000
81#define PyObject_DEL(op) PyMem_DEL((op))
82#endif
83
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084/* -------------------------------------------------------------------- */
85
Fredrik Lundh80946112000-06-29 18:03:25 +000086#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000087#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000088#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000089/* fastest possible local call under MSVC */
90#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000092#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000093#else
94#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000095#endif
96
97/* error codes */
98#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000099#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000100#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define SRE_ERROR_MEMORY -9 /* out of memory */
Facundo Batista4473d222008-01-08 21:10:12 +0000102#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +0000103
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000104#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000105#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000106#else
107#define TRACE(v)
108#endif
109
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000110/* -------------------------------------------------------------------- */
111/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000112
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000113/* default character predicates (run sre_chars.py to regenerate tables) */
114
115#define SRE_DIGIT_MASK 1
116#define SRE_SPACE_MASK 2
117#define SRE_LINEBREAK_MASK 4
118#define SRE_ALNUM_MASK 8
119#define SRE_WORD_MASK 16
120
Fredrik Lundh21009b92001-09-18 18:47:09 +0000121/* FIXME: this assumes ASCII. create tables in init_sre() instead */
122
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000123static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1242, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12625, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12724, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1280, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12924, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130
Fredrik Lundhb389df32000-06-29 12:48:37 +0000131static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000013210, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
13327, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13444, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13561, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139120, 121, 122, 123, 124, 125, 126, 127 };
140
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000141#define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143#define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145#define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147#define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149#define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000151
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000152static unsigned int sre_lower(unsigned int ch)
153{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000155}
156
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000157/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000158/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000162#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000163#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000164#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000166static unsigned int sre_lower_locale(unsigned int ch)
167{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000169}
170
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000171/* unicode-specific character predicates */
172
173#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000174
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000175#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000176#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000178#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000179#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000180
181static unsigned int sre_lower_unicode(unsigned int ch)
182{
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184}
185
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000186#endif
187
Guido van Rossumb700df92000-03-31 14:59:30 +0000188LOCAL(int)
189sre_category(SRE_CODE category, unsigned int ch)
190{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000191 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000193 case SRE_CATEGORY_DIGIT:
194 return SRE_IS_DIGIT(ch);
195 case SRE_CATEGORY_NOT_DIGIT:
196 return !SRE_IS_DIGIT(ch);
197 case SRE_CATEGORY_SPACE:
198 return SRE_IS_SPACE(ch);
199 case SRE_CATEGORY_NOT_SPACE:
200 return !SRE_IS_SPACE(ch);
201 case SRE_CATEGORY_WORD:
202 return SRE_IS_WORD(ch);
203 case SRE_CATEGORY_NOT_WORD:
204 return !SRE_IS_WORD(ch);
205 case SRE_CATEGORY_LINEBREAK:
206 return SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_NOT_LINEBREAK:
208 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000209
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000210 case SRE_CATEGORY_LOC_WORD:
211 return SRE_LOC_IS_WORD(ch);
212 case SRE_CATEGORY_LOC_NOT_WORD:
213 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000214
215#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 case SRE_CATEGORY_UNI_DIGIT:
217 return SRE_UNI_IS_DIGIT(ch);
218 case SRE_CATEGORY_UNI_NOT_DIGIT:
219 return !SRE_UNI_IS_DIGIT(ch);
220 case SRE_CATEGORY_UNI_SPACE:
221 return SRE_UNI_IS_SPACE(ch);
222 case SRE_CATEGORY_UNI_NOT_SPACE:
223 return !SRE_UNI_IS_SPACE(ch);
224 case SRE_CATEGORY_UNI_WORD:
225 return SRE_UNI_IS_WORD(ch);
226 case SRE_CATEGORY_UNI_NOT_WORD:
227 return !SRE_UNI_IS_WORD(ch);
228 case SRE_CATEGORY_UNI_LINEBREAK:
229 return SRE_UNI_IS_LINEBREAK(ch);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000232#else
233 case SRE_CATEGORY_UNI_DIGIT:
234 return SRE_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_NOT_DIGIT:
236 return !SRE_IS_DIGIT(ch);
237 case SRE_CATEGORY_UNI_SPACE:
238 return SRE_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_NOT_SPACE:
240 return !SRE_IS_SPACE(ch);
241 case SRE_CATEGORY_UNI_WORD:
242 return SRE_LOC_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_NOT_WORD:
244 return !SRE_LOC_IS_WORD(ch);
245 case SRE_CATEGORY_UNI_LINEBREAK:
246 return SRE_IS_LINEBREAK(ch);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000249#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000250 }
251 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000252}
253
254/* helpers */
255
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000256static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000258{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000259 if (state->data_stack) {
Jack Diederich2d400772006-05-27 15:44:34 +0000260 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000261 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000262 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000264}
265
266static int
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000267data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000268{
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000269 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000270 minsize = state->data_stack_base+size;
271 cursize = state->data_stack_size;
272 if (cursize < minsize) {
273 void* stack;
274 cursize = minsize+minsize/4+1024;
275 TRACE(("allocate/grow stack %d\n", cursize));
Jack Diederich2d400772006-05-27 15:44:34 +0000276 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000277 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000279 return SRE_ERROR_MEMORY;
280 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000281 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000282 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000283 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000284 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000285}
286
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000287/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#define SRE_CHAR unsigned char
290#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000291#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000292#define SRE_CHARSET sre_charset
293#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000295#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000297#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000298
299#if defined(HAVE_UNICODE)
300
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000303#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000304
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000305#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000306#undef SRE_SEARCH
307#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000308#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000309#undef SRE_INFO
310#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000311#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000312#undef SRE_AT
313#undef SRE_CHAR
314
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000316
317#define SRE_CHAR Py_UNICODE
318#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000319#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000320#define SRE_CHARSET sre_ucharset
321#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000322#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000323#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000324#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000325#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000326#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000327
328#endif /* SRE_RECURSIVE */
329
330/* -------------------------------------------------------------------- */
331/* String matching engine */
332
333/* the following section is compiled twice, with different character
334 settings */
335
336LOCAL(int)
337SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000340
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000341 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000346 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000347 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_BEGINNING_LINE:
350 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000351 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END:
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000354 return (((void*) (ptr+1) == state->end &&
355 SRE_IS_LINEBREAK((int) ptr[0])) ||
356 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000358 case SRE_AT_END_LINE:
359 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000360 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000361
Fredrik Lundh770617b2001-01-14 15:06:11 +0000362 case SRE_AT_END_STRING:
363 return ((void*) ptr == state->end);
364
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000368 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000370 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000372 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000377 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000378 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000379 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000380 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000381 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382
383 case SRE_AT_LOC_BOUNDARY:
384 if (state->beginning == state->end)
385 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000386 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000388 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000389 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000390 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391
392 case SRE_AT_LOC_NON_BOUNDARY:
393 if (state->beginning == state->end)
394 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000395 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000397 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000398 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000399 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000400
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000401#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000402 case SRE_AT_UNI_BOUNDARY:
403 if (state->beginning == state->end)
404 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000405 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000407 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000408 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000409 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410
411 case SRE_AT_UNI_NON_BOUNDARY:
412 if (state->beginning == state->end)
413 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000414 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000416 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000417 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000418 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000419#endif
420
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000421 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000422
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000423 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000424}
425
426LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000427SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000428{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000431 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000433 for (;;) {
434 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_FAILURE:
437 return !ok;
438
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000439 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000440 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000441 if (ch == set[0])
442 return ok;
443 set++;
444 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000446 case SRE_OP_CATEGORY:
447 /* <CATEGORY> <code> */
448 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000449 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000450 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000451 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000452
Fredrik Lundh3562f112000-07-02 12:00:07 +0000453 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000454 if (sizeof(SRE_CODE) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 return ok;
458 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000459 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000460 else {
461 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith64ab35e2012-12-10 17:45:54 -0800462 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000463 return ok;
464 set += 8;
465 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000466 break;
467
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000468 case SRE_OP_RANGE:
469 /* <RANGE> <lower> <upper> */
470 if (set[0] <= ch && ch <= set[1])
471 return ok;
472 set += 2;
473 break;
474
475 case SRE_OP_NEGATE:
476 ok = !ok;
477 break;
478
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000479 case SRE_OP_BIGCHARSET:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000482 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000483 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000484
485 if (sizeof(SRE_CODE) == 2) {
486 block = ((unsigned char*)set)[ch >> 8];
487 set += 128;
488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 return ok;
490 set += count*16;
491 }
492 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
495 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000496 block = ((unsigned char*)set)[ch >> 8];
497 else
498 block = -1;
499 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000500 if (block >=0 &&
Gregory P. Smith64ab35e2012-12-10 17:45:54 -0800501 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000502 return ok;
503 set += count*8;
504 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000505 break;
506 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000507
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000508 default:
509 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000510 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000511 return 0;
512 }
513 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000514}
515
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000516LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000518LOCAL(Py_ssize_t)
519SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000520{
521 SRE_CODE chr;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000524 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000525
526 /* adjust end */
Serhiy Storchakae18e05c2013-02-16 16:47:15 +0200527 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000528 end = ptr + maxcount;
529
530 switch (pattern[0]) {
531
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000532 case SRE_OP_IN:
533 /* repeated set */
534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 ptr++;
537 break;
538
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539 case SRE_OP_ANY:
540 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 ptr++;
544 break;
545
546 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000547 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 ptr = end;
551 break;
552
553 case SRE_OP_LITERAL:
554 /* repeated literal */
555 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000557 while (ptr < end && (SRE_CODE) *ptr == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_LITERAL_IGNORE:
562 /* repeated literal */
563 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL:
570 /* repeated non-literal */
571 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 while (ptr < end && (SRE_CODE) *ptr != chr)
574 ptr++;
575 break;
Tim Peters3d563502006-01-21 02:47:53 +0000576
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 case SRE_OP_NOT_LITERAL_IGNORE:
578 /* repeated non-literal */
579 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 ptr++;
583 break;
584
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 default:
586 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000588 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000589 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 if (i < 0)
591 return i;
592 if (!i)
593 break;
594 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000595 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
596 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000597 return (SRE_CHAR*) state->ptr - ptr;
598 }
599
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000600 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000601 return ptr - (SRE_CHAR*) state->ptr;
602}
603
Fredrik Lundh33accc12000-08-27 20:59:47 +0000604#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000605LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000606SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607{
608 /* check if an SRE_OP_INFO block matches at the current position.
609 returns the number of SRE_CODE objects to skip if successful, 0
610 if no match */
611
612 SRE_CHAR* end = state->end;
613 SRE_CHAR* ptr = state->ptr;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000614 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000615
616 /* check minimal length */
617 if (pattern[3] && (end - ptr) < pattern[3])
618 return 0;
619
620 /* check known prefix */
621 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
622 /* <length> <skip> <prefix data> <overlap data> */
623 for (i = 0; i < pattern[5]; i++)
624 if ((SRE_CODE) ptr[i] != pattern[7 + i])
625 return 0;
626 return pattern[0] + 2 * pattern[6];
627 }
628 return pattern[0];
629}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000630#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000631
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000632/* The macros below should be used to protect recursive SRE_MATCH()
633 * calls that *failed* and do *not* return immediately (IOW, those
634 * that will backtrack). Explaining:
635 *
636 * - Recursive SRE_MATCH() returned true: that's usually a success
637 * (besides atypical cases like ASSERT_NOT), therefore there's no
638 * reason to restore lastmark;
639 *
640 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
641 * is returning to the caller: If the current SRE_MATCH() is the
642 * top function of the recursion, returning false will be a matching
643 * failure, and it doesn't matter where lastmark is pointing to.
644 * If it's *not* the top function, it will be a recursive SRE_MATCH()
645 * failure by itself, and the calling SRE_MATCH() will have to deal
646 * with the failure by the same rules explained here (it will restore
647 * lastmark by itself if necessary);
648 *
649 * - Recursive SRE_MATCH() returned false, and will continue the
650 * outside 'for' loop: must be protected when breaking, since the next
651 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000652 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000653 * - Recursive SRE_MATCH() returned false, and will be called again
654 * inside a local for/while loop: must be protected between each
655 * loop iteration, since the recursive SRE_MATCH() could do anything,
656 * and could potentially depend on lastmark.
657 *
658 * For more information, check the discussion at SF patch #712900.
659 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000660#define LASTMARK_SAVE() \
661 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000662 ctx->lastmark = state->lastmark; \
663 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000664 } while (0)
665#define LASTMARK_RESTORE() \
666 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000667 state->lastmark = ctx->lastmark; \
668 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000669 } while (0)
670
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000671#define RETURN_ERROR(i) do { return i; } while(0)
672#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
673#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674
675#define RETURN_ON_ERROR(i) \
676 do { if (i < 0) RETURN_ERROR(i); } while (0)
677#define RETURN_ON_SUCCESS(i) \
678 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
679#define RETURN_ON_FAILURE(i) \
680 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681
682#define SFY(x) #x
683
684#define DATA_STACK_ALLOC(state, type, ptr) \
685do { \
686 alloc_pos = state->data_stack_base; \
687 TRACE(("allocating %s in %d (%d)\n", \
688 SFY(type), alloc_pos, sizeof(type))); \
689 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
690 int j = data_stack_grow(state, sizeof(type)); \
691 if (j < 0) return j; \
692 if (ctx_pos != -1) \
693 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 } \
695 ptr = (type*)(state->data_stack+alloc_pos); \
696 state->data_stack_base += sizeof(type); \
697} while (0)
698
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000699#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
700do { \
701 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
702 ptr = (type*)(state->data_stack+pos); \
703} while (0)
704
705#define DATA_STACK_PUSH(state, data, size) \
706do { \
707 TRACE(("copy data in %p to %d (%d)\n", \
708 data, state->data_stack_base, size)); \
709 if (state->data_stack_size < state->data_stack_base+size) { \
710 int j = data_stack_grow(state, size); \
711 if (j < 0) return j; \
712 if (ctx_pos != -1) \
713 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 } \
715 memcpy(state->data_stack+state->data_stack_base, data, size); \
716 state->data_stack_base += size; \
717} while (0)
718
719#define DATA_STACK_POP(state, data, size, discard) \
720do { \
721 TRACE(("copy data to %p from %d (%d)\n", \
722 data, state->data_stack_base-size, size)); \
723 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
724 if (discard) \
725 state->data_stack_base -= size; \
726} while (0)
727
728#define DATA_STACK_POP_DISCARD(state, size) \
729do { \
730 TRACE(("discard data from %d (%d)\n", \
731 state->data_stack_base-size, size)); \
732 state->data_stack_base -= size; \
733} while(0)
734
735#define DATA_PUSH(x) \
736 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
737#define DATA_POP(x) \
738 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000739#define DATA_POP_DISCARD(x) \
740 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
741#define DATA_ALLOC(t,p) \
742 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000743#define DATA_LOOKUP_AT(t,p,pos) \
744 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745
746#define MARK_PUSH(lastmark) \
747 do if (lastmark > 0) { \
748 i = lastmark; /* ctx->lastmark may change if reallocated */ \
749 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
750 } while (0)
751#define MARK_POP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
754 } while (0)
755#define MARK_POP_KEEP(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
758 } while (0)
759#define MARK_POP_DISCARD(lastmark) \
760 do if (lastmark > 0) { \
761 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
762 } while (0)
763
764#define JUMP_NONE 0
765#define JUMP_MAX_UNTIL_1 1
766#define JUMP_MAX_UNTIL_2 2
767#define JUMP_MAX_UNTIL_3 3
768#define JUMP_MIN_UNTIL_1 4
769#define JUMP_MIN_UNTIL_2 5
770#define JUMP_MIN_UNTIL_3 6
771#define JUMP_REPEAT 7
772#define JUMP_REPEAT_ONE_1 8
773#define JUMP_REPEAT_ONE_2 9
774#define JUMP_MIN_REPEAT_ONE 10
775#define JUMP_BRANCH 11
776#define JUMP_ASSERT 12
777#define JUMP_ASSERT_NOT 13
778
779#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
780 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
781 nextctx->last_ctx_pos = ctx_pos; \
782 nextctx->jump = jumpvalue; \
783 nextctx->pattern = nextpattern; \
784 ctx_pos = alloc_pos; \
785 ctx = nextctx; \
786 goto entrance; \
787 jumplabel: \
788 while (0) /* gcc doesn't like labels at end of scopes */ \
789
790typedef struct {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000791 Py_ssize_t last_ctx_pos;
792 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000793 SRE_CHAR* ptr;
794 SRE_CODE* pattern;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000795 Py_ssize_t count;
796 Py_ssize_t lastmark;
797 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798 union {
799 SRE_CODE chr;
800 SRE_REPEAT* rep;
801 } u;
802} SRE_MATCH_CONTEXT;
803
804/* check if string matches the given pattern. returns <0 for
805 error, 0 for failure, and 1 for success */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000806LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000807SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000808{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000809 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000810 Py_ssize_t alloc_pos, ctx_pos = -1;
811 Py_ssize_t i, ret = 0;
812 Py_ssize_t jump;
Facundo Batista4473d222008-01-08 21:10:12 +0000813 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000814
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000815 SRE_MATCH_CONTEXT* ctx;
816 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000817
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000818 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000819
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000820 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
821 ctx->last_ctx_pos = -1;
822 ctx->jump = JUMP_NONE;
823 ctx->pattern = pattern;
824 ctx_pos = alloc_pos;
825
826entrance:
827
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000828 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829
830 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000831 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000832 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000833 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000834 TRACE(("reject (got %d chars, need %d)\n",
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000835 (end - ctx->ptr), ctx->pattern[3]));
836 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000837 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000838 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000839 }
840
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000841 for (;;) {
Facundo Batista4473d222008-01-08 21:10:12 +0000842 ++sigcount;
843 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
844 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000845
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000846 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000847
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000848 case SRE_OP_MARK:
849 /* set mark */
850 /* <MARK> <gid> */
851 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
852 ctx->ptr, ctx->pattern[0]));
853 i = ctx->pattern[0];
854 if (i & 1)
855 state->lastindex = i/2 + 1;
856 if (i > state->lastmark) {
857 /* state->lastmark is the highest valid index in the
858 state->mark array. If it is increased by more than 1,
859 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000860 that these marks have not been encountered. */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000861 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000862 while (j < i)
863 state->mark[j++] = NULL;
864 state->lastmark = i;
865 }
866 state->mark[i] = ctx->ptr;
867 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000868 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000869
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 case SRE_OP_LITERAL:
871 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000872 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000873 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
874 ctx->ptr, *ctx->pattern));
875 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
876 RETURN_FAILURE;
877 ctx->pattern++;
878 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000879 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000880
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000881 case SRE_OP_NOT_LITERAL:
882 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000883 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000884 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
885 ctx->ptr, *ctx->pattern));
886 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
887 RETURN_FAILURE;
888 ctx->pattern++;
889 ctx->ptr++;
890 break;
891
892 case SRE_OP_SUCCESS:
893 /* end of pattern */
894 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
895 state->ptr = ctx->ptr;
896 RETURN_SUCCESS;
897
898 case SRE_OP_AT:
899 /* match at given position */
900 /* <AT> <code> */
901 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
902 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
903 RETURN_FAILURE;
904 ctx->pattern++;
905 break;
906
907 case SRE_OP_CATEGORY:
908 /* match at given category */
909 /* <CATEGORY> <code> */
910 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
911 ctx->ptr, *ctx->pattern));
912 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
913 RETURN_FAILURE;
914 ctx->pattern++;
915 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000916 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000917
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000918 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000919 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000920 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000921 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
922 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
923 RETURN_FAILURE;
924 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000925 break;
926
927 case SRE_OP_ANY_ALL:
928 /* match anything */
929 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
931 if (ctx->ptr >= end)
932 RETURN_FAILURE;
933 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000934 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000935
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000936 case SRE_OP_IN:
937 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000938 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000939 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
940 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
941 RETURN_FAILURE;
942 ctx->pattern += ctx->pattern[0];
943 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000944 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000945
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000946 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000947 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
948 ctx->pattern, ctx->ptr, ctx->pattern[0]));
949 if (ctx->ptr >= end ||
950 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
951 RETURN_FAILURE;
952 ctx->pattern++;
953 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000954 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000955
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000956 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000957 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
958 ctx->pattern, ctx->ptr, *ctx->pattern));
959 if (ctx->ptr >= end ||
960 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
961 RETURN_FAILURE;
962 ctx->pattern++;
963 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000964 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000965
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000966 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000967 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
968 if (ctx->ptr >= end
969 || !SRE_CHARSET(ctx->pattern+1,
970 (SRE_CODE)state->lower(*ctx->ptr)))
971 RETURN_FAILURE;
972 ctx->pattern += ctx->pattern[0];
973 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000974 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000975
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000976 case SRE_OP_JUMP:
977 case SRE_OP_INFO:
978 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000979 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000980 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
981 ctx->ptr, ctx->pattern[0]));
982 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000983 break;
984
985 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000986 /* alternation */
987 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000988 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000989 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000990 ctx->u.rep = state->repeat;
991 if (ctx->u.rep)
992 MARK_PUSH(ctx->lastmark);
993 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
994 if (ctx->pattern[1] == SRE_OP_LITERAL &&
995 (ctx->ptr >= end ||
996 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000997 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000998 if (ctx->pattern[1] == SRE_OP_IN &&
999 (ctx->ptr >= end ||
1000 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001001 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001002 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 if (ret) {
1005 if (ctx->u.rep)
1006 MARK_POP_DISCARD(ctx->lastmark);
1007 RETURN_ON_ERROR(ret);
1008 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001009 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 if (ctx->u.rep)
1011 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001012 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001013 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001014 if (ctx->u.rep)
1015 MARK_POP_DISCARD(ctx->lastmark);
1016 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001017
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018 case SRE_OP_REPEAT_ONE:
1019 /* match repeated sequence (maximizing regexp) */
1020
1021 /* this operator only works if the repeated item is
1022 exactly one character wide, and we're not already
1023 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001024 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001025
1026 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1027
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001028 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1029 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001030
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001031 if (ctx->ptr + ctx->pattern[1] > end)
1032 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001035
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001036 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1037 RETURN_ON_ERROR(ret);
1038 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1039 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001041
1042 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001044 string. check if the rest of the pattern matches,
1045 and backtrack if not. */
1046
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001047 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001048 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001049
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001050 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001051 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001052 state->ptr = ctx->ptr;
1053 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001054 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001056 LASTMARK_SAVE();
1057
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001059 /* tail starts with a literal. skip positions where
1060 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001062 for (;;) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001063 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1065 ctx->ptr--;
1066 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001067 }
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001068 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001069 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001070 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001071 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1072 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001073 if (ret) {
1074 RETURN_ON_ERROR(ret);
1075 RETURN_SUCCESS;
1076 }
Tim Peters3d563502006-01-21 02:47:53 +00001077
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001078 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001079
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 ctx->ptr--;
1081 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001082 }
1083
1084 } else {
1085 /* general case */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001086 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001087 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001088 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1089 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001090 if (ret) {
1091 RETURN_ON_ERROR(ret);
1092 RETURN_SUCCESS;
1093 }
1094 ctx->ptr--;
1095 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001096 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001097 }
1098 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001099 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001100
Guido van Rossum41c99e72003-04-14 17:59:34 +00001101 case SRE_OP_MIN_REPEAT_ONE:
1102 /* match repeated sequence (minimizing regexp) */
1103
1104 /* this operator only works if the repeated item is
1105 exactly one character wide, and we're not already
1106 collecting backtracking points. for other cases,
1107 use the MIN_REPEAT operator */
1108
1109 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1110
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001111 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1112 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001113
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001114 if (ctx->ptr + ctx->pattern[1] > end)
1115 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001116
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001118
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001119 if (ctx->pattern[1] == 0)
1120 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001121 else {
1122 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001123 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1124 RETURN_ON_ERROR(ret);
1125 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001126 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001127 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001128 RETURN_FAILURE;
1129 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001130 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001131 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001132 }
1133
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001134 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001135 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001136 state->ptr = ctx->ptr;
1137 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001138
1139 } else {
1140 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001141 LASTMARK_SAVE();
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001142 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001143 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001144 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001145 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1146 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001147 if (ret) {
1148 RETURN_ON_ERROR(ret);
1149 RETURN_SUCCESS;
1150 }
1151 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001152 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001153 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001154 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001156 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001157 assert(ret == 1);
1158 ctx->ptr++;
1159 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001160 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001161 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001162 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001163 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001164
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001165 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001166 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001167 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001168 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001169 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1170 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001171
1172 /* install new repeat context */
Jack Diederich2d400772006-05-27 15:44:34 +00001173 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001174 if (!ctx->u.rep) {
1175 PyErr_NoMemory();
Neal Norwitzef0de022006-08-12 01:53:28 +00001176 RETURN_FAILURE;
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001177 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 ctx->u.rep->count = -1;
1179 ctx->u.rep->pattern = ctx->pattern;
1180 ctx->u.rep->prev = state->repeat;
1181 ctx->u.rep->last_ptr = NULL;
1182 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001183
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001186 state->repeat = ctx->u.rep->prev;
Jack Diederich2d400772006-05-27 15:44:34 +00001187 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001188
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 if (ret) {
1190 RETURN_ON_ERROR(ret);
1191 RETURN_SUCCESS;
1192 }
1193 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
1195 case SRE_OP_MAX_UNTIL:
1196 /* maximizing repeat */
1197 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1198
1199 /* FIXME: we probably need to deal with zero-width
1200 matches in here... */
1201
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 ctx->u.rep = state->repeat;
1203 if (!ctx->u.rep)
1204 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001205
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001206 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001207
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001209
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001210 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1211 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001212
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001213 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001214 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001215 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001216 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1217 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 if (ret) {
1219 RETURN_ON_ERROR(ret);
1220 RETURN_SUCCESS;
1221 }
1222 ctx->u.rep->count = ctx->count-1;
1223 state->ptr = ctx->ptr;
1224 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001225 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001226
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001227 if ((ctx->count < ctx->u.rep->pattern[2] ||
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001228 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001229 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001230 /* we may have enough matches, but if we can
1231 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001232 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001233 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001234 MARK_PUSH(ctx->lastmark);
1235 /* zero-width match protection */
1236 DATA_PUSH(&ctx->u.rep->last_ptr);
1237 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001238 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1239 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001240 DATA_POP(&ctx->u.rep->last_ptr);
1241 if (ret) {
1242 MARK_POP_DISCARD(ctx->lastmark);
1243 RETURN_ON_ERROR(ret);
1244 RETURN_SUCCESS;
1245 }
1246 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001247 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->u.rep->count = ctx->count-1;
1249 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001250 }
1251
1252 /* cannot match more repeated items here. make sure the
1253 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001255 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001256 RETURN_ON_SUCCESS(ret);
1257 state->repeat = ctx->u.rep;
1258 state->ptr = ctx->ptr;
1259 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001260
1261 case SRE_OP_MIN_UNTIL:
1262 /* minimizing repeat */
1263 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1264
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 ctx->u.rep = state->repeat;
1266 if (!ctx->u.rep)
1267 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001268
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001269 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001270
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001272
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001273 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1274 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001275
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001276 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001277 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001278 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001279 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1280 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001281 if (ret) {
1282 RETURN_ON_ERROR(ret);
1283 RETURN_SUCCESS;
1284 }
1285 ctx->u.rep->count = ctx->count-1;
1286 state->ptr = ctx->ptr;
1287 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001288 }
1289
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001290 LASTMARK_SAVE();
1291
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001292 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001293 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001294 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001295 if (ret) {
1296 RETURN_ON_ERROR(ret);
1297 RETURN_SUCCESS;
1298 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001299
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001300 state->repeat = ctx->u.rep;
1301 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001302
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001303 LASTMARK_RESTORE();
1304
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +02001305 if ((ctx->count >= ctx->u.rep->pattern[2]
1306 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1307 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001308 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001309
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001310 ctx->u.rep->count = ctx->count;
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +02001311 /* zero-width match protection */
1312 DATA_PUSH(&ctx->u.rep->last_ptr);
1313 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001314 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1315 ctx->u.rep->pattern+3);
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +02001316 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001317 if (ret) {
1318 RETURN_ON_ERROR(ret);
1319 RETURN_SUCCESS;
1320 }
1321 ctx->u.rep->count = ctx->count-1;
1322 state->ptr = ctx->ptr;
1323 RETURN_FAILURE;
1324
1325 case SRE_OP_GROUPREF:
1326 /* match backreference */
1327 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1328 ctx->ptr, ctx->pattern[0]));
1329 i = ctx->pattern[0];
1330 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001331 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001332 if (groupref >= state->lastmark) {
1333 RETURN_FAILURE;
1334 } else {
1335 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1336 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1337 if (!p || !e || e < p)
1338 RETURN_FAILURE;
1339 while (p < e) {
1340 if (ctx->ptr >= end || *ctx->ptr != *p)
1341 RETURN_FAILURE;
1342 p++; ctx->ptr++;
1343 }
1344 }
1345 }
1346 ctx->pattern++;
1347 break;
1348
1349 case SRE_OP_GROUPREF_IGNORE:
1350 /* match backreference */
1351 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1352 ctx->ptr, ctx->pattern[0]));
1353 i = ctx->pattern[0];
1354 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001355 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001356 if (groupref >= state->lastmark) {
1357 RETURN_FAILURE;
1358 } else {
1359 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1360 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1361 if (!p || !e || e < p)
1362 RETURN_FAILURE;
1363 while (p < e) {
1364 if (ctx->ptr >= end ||
1365 state->lower(*ctx->ptr) != state->lower(*p))
1366 RETURN_FAILURE;
1367 p++; ctx->ptr++;
1368 }
1369 }
1370 }
1371 ctx->pattern++;
1372 break;
1373
1374 case SRE_OP_GROUPREF_EXISTS:
1375 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1376 ctx->ptr, ctx->pattern[0]));
1377 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1378 i = ctx->pattern[0];
1379 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001380 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001381 if (groupref >= state->lastmark) {
1382 ctx->pattern += ctx->pattern[1];
1383 break;
1384 } else {
1385 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1386 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1387 if (!p || !e || e < p) {
1388 ctx->pattern += ctx->pattern[1];
1389 break;
1390 }
1391 }
1392 }
1393 ctx->pattern += 2;
1394 break;
1395
1396 case SRE_OP_ASSERT:
1397 /* assert subpattern */
1398 /* <ASSERT> <skip> <back> <pattern> */
1399 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1400 ctx->ptr, ctx->pattern[1]));
1401 state->ptr = ctx->ptr - ctx->pattern[1];
1402 if (state->ptr < state->beginning)
1403 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001404 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001405 RETURN_ON_FAILURE(ret);
1406 ctx->pattern += ctx->pattern[0];
1407 break;
1408
1409 case SRE_OP_ASSERT_NOT:
1410 /* assert not subpattern */
1411 /* <ASSERT_NOT> <skip> <back> <pattern> */
1412 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1413 ctx->ptr, ctx->pattern[1]));
1414 state->ptr = ctx->ptr - ctx->pattern[1];
1415 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001416 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001417 if (ret) {
1418 RETURN_ON_ERROR(ret);
1419 RETURN_FAILURE;
1420 }
1421 }
1422 ctx->pattern += ctx->pattern[0];
1423 break;
1424
1425 case SRE_OP_FAILURE:
1426 /* immediate failure */
1427 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1428 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001429
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001430 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001431 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1432 ctx->pattern[-1]));
1433 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001434 }
1435 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001436
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001437exit:
1438 ctx_pos = ctx->last_ctx_pos;
1439 jump = ctx->jump;
1440 DATA_POP_DISCARD(ctx);
1441 if (ctx_pos == -1)
1442 return ret;
1443 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1444
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001445 switch (jump) {
1446 case JUMP_MAX_UNTIL_2:
1447 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1448 goto jump_max_until_2;
1449 case JUMP_MAX_UNTIL_3:
1450 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1451 goto jump_max_until_3;
1452 case JUMP_MIN_UNTIL_2:
1453 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1454 goto jump_min_until_2;
1455 case JUMP_MIN_UNTIL_3:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_until_3;
1458 case JUMP_BRANCH:
1459 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1460 goto jump_branch;
1461 case JUMP_MAX_UNTIL_1:
1462 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1463 goto jump_max_until_1;
1464 case JUMP_MIN_UNTIL_1:
1465 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1466 goto jump_min_until_1;
1467 case JUMP_REPEAT:
1468 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1469 goto jump_repeat;
1470 case JUMP_REPEAT_ONE_1:
1471 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1472 goto jump_repeat_one_1;
1473 case JUMP_REPEAT_ONE_2:
1474 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1475 goto jump_repeat_one_2;
1476 case JUMP_MIN_REPEAT_ONE:
1477 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1478 goto jump_min_repeat_one;
1479 case JUMP_ASSERT:
1480 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1481 goto jump_assert;
1482 case JUMP_ASSERT_NOT:
1483 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1484 goto jump_assert_not;
1485 case JUMP_NONE:
1486 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1487 break;
1488 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001489
1490 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001491}
1492
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001493LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001494SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1495{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001496 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1497 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001498 Py_ssize_t status = 0;
1499 Py_ssize_t prefix_len = 0;
1500 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001501 SRE_CODE* prefix = NULL;
1502 SRE_CODE* charset = NULL;
1503 SRE_CODE* overlap = NULL;
1504 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001505
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001506 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001507 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001508 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001509
1510 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001511
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001512 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001513 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001514 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001515 end -= pattern[3]-1;
1516 if (end <= ptr)
1517 end = ptr+1;
1518 }
1519
Fredrik Lundh3562f112000-07-02 12:00:07 +00001520 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001521 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001522 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001523 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001524 prefix_skip = pattern[6];
1525 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001526 overlap = prefix + prefix_len - 1;
1527 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001528 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001529 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001530 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001531
1532 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001533 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001534
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001535 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1536 TRACE(("charset = %p\n", charset));
1537
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001538#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001539 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001540 /* pattern starts with a known prefix. use the overlap
1541 table to skip forward as fast as we possibly can */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001542 Py_ssize_t i = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001543 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001544 while (ptr < end) {
1545 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001546 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001547 if (!i)
1548 break;
1549 else
1550 i = overlap[i];
1551 } else {
1552 if (++i == prefix_len) {
1553 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001554 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1555 state->start = ptr + 1 - prefix_len;
1556 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001557 if (flags & SRE_INFO_LITERAL)
1558 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001559 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001560 if (status != 0)
1561 return status;
1562 /* close but no cigar -- try again */
1563 i = overlap[i];
1564 }
1565 break;
1566 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001567 }
1568 ptr++;
1569 }
1570 return 0;
1571 }
1572#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001573
Fredrik Lundh3562f112000-07-02 12:00:07 +00001574 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001575 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001576 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 SRE_CODE chr = pattern[1];
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001578 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001579 for (;;) {
1580 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1581 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001582 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001583 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001584 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001585 state->start = ptr;
1586 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001587 if (flags & SRE_INFO_LITERAL)
1588 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001589 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001590 if (status != 0)
1591 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001592 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 } else if (charset) {
1594 /* pattern starts with a character from a known set */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001595 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001596 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001597 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001598 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001599 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001600 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001601 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001602 state->start = ptr;
1603 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001604 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001605 if (status != 0)
1606 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001607 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001608 }
1609 } else
1610 /* general case */
1611 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001612 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001613 state->start = state->ptr = ptr++;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001614 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001615 if (status != 0)
1616 break;
1617 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001618
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001619 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001620}
Tim Peters3d563502006-01-21 02:47:53 +00001621
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001622LOCAL(int)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001623SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001624{
1625 /* check if given string is a literal template (i.e. no escapes) */
1626 while (len-- > 0)
1627 if (*ptr++ == '\\')
1628 return 0;
1629 return 1;
1630}
Guido van Rossumb700df92000-03-31 14:59:30 +00001631
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001632#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001633
1634/* -------------------------------------------------------------------- */
1635/* factories and destructors */
1636
1637/* see sre.h for object declarations */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001638static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1639static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001640
1641static PyObject *
Georg Brandl964f5972006-05-28 22:38:57 +00001642sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001643{
Benjamin Peterson9dccb012013-01-10 10:37:47 -06001644 return PyInt_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001645}
1646
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001647static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001648sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001649{
1650 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001651 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001652 return NULL;
1653 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001654 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001655 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001656#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001657 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001658#else
1659 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001660#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001661 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001662}
1663
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001664LOCAL(void)
1665state_reset(SRE_STATE* state)
1666{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001667 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001668 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001669
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001670 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001671 state->lastindex = -1;
1672
1673 state->repeat = NULL;
1674
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001675 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001676}
1677
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001678static void*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001679getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001680{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001681 /* given a python object, return a data pointer, a length (in
1682 characters), and a character size. return NULL if the object
1683 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001684
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001685 PyBufferProcs *buffer;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001686 Py_ssize_t size, bytes;
1687 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001688 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001689
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001690#if defined(HAVE_UNICODE)
1691 if (PyUnicode_Check(string)) {
1692 /* unicode strings doesn't always support the buffer interface */
1693 ptr = (void*) PyUnicode_AS_DATA(string);
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00001694 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001695 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001696 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001697
1698 } else {
1699#endif
1700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 /* get pointer to string buffer */
Christian Heimese93237d2007-12-19 02:37:44 +00001702 buffer = Py_TYPE(string)->tp_as_buffer;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001703 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1704 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001705 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001706 return NULL;
1707 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001708
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001709 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001710 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1711 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001712 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1713 return NULL;
1714 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001715
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001716 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001717#if PY_VERSION_HEX >= 0x01060000
1718 size = PyObject_Size(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001719#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001720 size = PyObject_Length(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001721#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001722
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001723 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001724 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001725#if defined(HAVE_UNICODE)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001726 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001727 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001728#endif
1729 else {
1730 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1731 return NULL;
1732 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001733
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001734#if defined(HAVE_UNICODE)
1735 }
1736#endif
1737
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001738 *p_length = size;
1739 *p_charsize = charsize;
1740
1741 return ptr;
1742}
1743
1744LOCAL(PyObject*)
1745state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001746 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001747{
1748 /* prepare state object */
1749
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001750 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001751 int charsize;
1752 void* ptr;
1753
1754 memset(state, 0, sizeof(SRE_STATE));
1755
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001756 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001757 state->lastindex = -1;
1758
1759 ptr = getstring(string, &length, &charsize);
1760 if (!ptr)
1761 return NULL;
1762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001763 /* adjust boundaries */
1764 if (start < 0)
1765 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 else if (start > length)
1767 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001768
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001769 if (end < 0)
1770 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001771 else if (end > length)
1772 end = length;
1773
1774 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001775
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001776 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001777
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001778 state->start = (void*) ((char*) ptr + start * state->charsize);
1779 state->end = (void*) ((char*) ptr + end * state->charsize);
1780
1781 Py_INCREF(string);
1782 state->string = string;
1783 state->pos = start;
1784 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001785
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001786 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001787 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001788 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001789#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001790 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001791#else
1792 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001793#endif
1794 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001795 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001796
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001797 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001798}
1799
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001800LOCAL(void)
1801state_fini(SRE_STATE* state)
1802{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001803 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001804 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001805}
1806
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001807/* calculate offset from start of string */
1808#define STATE_OFFSET(state, member)\
1809 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1810
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001811LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001812state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001813{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001814 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001815
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001816 index = (index - 1) * 2;
1817
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001818 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001819 if (empty)
1820 /* want empty string */
1821 i = j = 0;
1822 else {
1823 Py_INCREF(Py_None);
1824 return Py_None;
1825 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001826 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001827 i = STATE_OFFSET(state, state->mark[index]);
1828 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001829 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001830
Fredrik Lundh58100642000-08-09 09:14:35 +00001831 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001832}
1833
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001834static void
1835pattern_error(int status)
1836{
1837 switch (status) {
1838 case SRE_ERROR_RECURSION_LIMIT:
1839 PyErr_SetString(
1840 PyExc_RuntimeError,
1841 "maximum recursion limit exceeded"
1842 );
1843 break;
1844 case SRE_ERROR_MEMORY:
1845 PyErr_NoMemory();
1846 break;
Facundo Batista4473d222008-01-08 21:10:12 +00001847 case SRE_ERROR_INTERRUPTED:
1848 /* An exception has already been raised, so let it fly */
1849 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001850 default:
1851 /* other error codes indicate compiler/engine bugs */
1852 PyErr_SetString(
1853 PyExc_RuntimeError,
1854 "internal error in regular expression engine"
1855 );
1856 }
1857}
1858
Guido van Rossumb700df92000-03-31 14:59:30 +00001859static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001860pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001861{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001862 if (self->weakreflist != NULL)
1863 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001864 Py_XDECREF(self->pattern);
1865 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001866 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001867 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001868}
1869
1870static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001871pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001872{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001873 SRE_STATE state;
1874 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001875
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001876 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001877 Py_ssize_t start = 0;
1878 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001879 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001880 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001881 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001882 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001883
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001884 string = state_init(&state, self, string, start, end);
1885 if (!string)
1886 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001887
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001888 state.ptr = state.start;
1889
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001890 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1891
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001892 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001893 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001894 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001895#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001896 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001897#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001898 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001899
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001900 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001901 if (PyErr_Occurred())
1902 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001903
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001905
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001906 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001907}
1908
1909static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001910pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001911{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001912 SRE_STATE state;
1913 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001914
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001915 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001916 Py_ssize_t start = 0;
1917 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001918 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001919 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001920 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001921 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001922
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001923 string = state_init(&state, self, string, start, end);
1924 if (!string)
1925 return NULL;
1926
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001927 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1928
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001929 if (state.charsize == 1) {
1930 status = sre_search(&state, PatternObject_GetCode(self));
1931 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001932#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001933 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001934#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001935 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001936
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001937 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1938
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001939 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001940
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001941 if (PyErr_Occurred())
1942 return NULL;
1943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001944 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001945}
1946
1947static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001948call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001949{
1950 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001951 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001952 PyObject* func;
1953 PyObject* result;
1954
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001955 if (!args)
1956 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001957 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001958 if (!name)
1959 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001960 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001961 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001962 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001963 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001964 func = PyObject_GetAttrString(mod, function);
1965 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001966 if (!func)
1967 return NULL;
1968 result = PyObject_CallObject(func, args);
1969 Py_DECREF(func);
1970 Py_DECREF(args);
1971 return result;
1972}
1973
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001974#ifdef USE_BUILTIN_COPY
1975static int
1976deepcopy(PyObject** object, PyObject* memo)
1977{
1978 PyObject* copy;
1979
1980 copy = call(
1981 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001982 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001983 );
1984 if (!copy)
1985 return 0;
1986
1987 Py_DECREF(*object);
1988 *object = copy;
1989
1990 return 1; /* success */
1991}
1992#endif
1993
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001994static PyObject*
Guido van Rossum1ff91d92007-09-10 22:02:25 +00001995join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001996{
1997 /* join list elements */
1998
1999 PyObject* joiner;
2000#if PY_VERSION_HEX >= 0x01060000
2001 PyObject* function;
2002 PyObject* args;
2003#endif
2004 PyObject* result;
2005
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002006 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002007 if (!joiner)
2008 return NULL;
2009
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002010 if (PyList_GET_SIZE(list) == 0) {
2011 Py_DECREF(list);
2012 return joiner;
2013 }
2014
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002015#if PY_VERSION_HEX >= 0x01060000
2016 function = PyObject_GetAttrString(joiner, "join");
2017 if (!function) {
2018 Py_DECREF(joiner);
2019 return NULL;
2020 }
2021 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002022 if (!args) {
2023 Py_DECREF(function);
2024 Py_DECREF(joiner);
2025 return NULL;
2026 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002027 PyTuple_SET_ITEM(args, 0, list);
2028 result = PyObject_CallObject(function, args);
2029 Py_DECREF(args); /* also removes list */
2030 Py_DECREF(function);
2031#else
2032 result = call(
2033 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002034 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002035 );
2036#endif
2037 Py_DECREF(joiner);
2038
2039 return result;
2040}
2041
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002042static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002043pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002044{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002045 SRE_STATE state;
2046 PyObject* list;
2047 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002048 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002049
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002051 Py_ssize_t start = 0;
2052 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002053 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002054 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002055 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002056 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002057
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002058 string = state_init(&state, self, string, start, end);
2059 if (!string)
2060 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002061
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002062 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002063 if (!list) {
2064 state_fini(&state);
2065 return NULL;
2066 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002069
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002070 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002071
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002072 state_reset(&state);
2073
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002074 state.ptr = state.start;
2075
2076 if (state.charsize == 1) {
2077 status = sre_search(&state, PatternObject_GetCode(self));
2078 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002079#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002080 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002081#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002083
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002084 if (PyErr_Occurred())
2085 goto error;
2086
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002087 if (status <= 0) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002088 if (status == 0)
2089 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002090 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002091 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002092 }
Tim Peters3d563502006-01-21 02:47:53 +00002093
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002094 /* don't bother to build a match object */
2095 switch (self->groups) {
2096 case 0:
2097 b = STATE_OFFSET(&state, state.start);
2098 e = STATE_OFFSET(&state, state.ptr);
2099 item = PySequence_GetSlice(string, b, e);
2100 if (!item)
2101 goto error;
2102 break;
2103 case 1:
2104 item = state_getslice(&state, 1, string, 1);
2105 if (!item)
2106 goto error;
2107 break;
2108 default:
2109 item = PyTuple_New(self->groups);
2110 if (!item)
2111 goto error;
2112 for (i = 0; i < self->groups; i++) {
2113 PyObject* o = state_getslice(&state, i+1, string, 1);
2114 if (!o) {
2115 Py_DECREF(item);
2116 goto error;
2117 }
2118 PyTuple_SET_ITEM(item, i, o);
2119 }
2120 break;
2121 }
2122
2123 status = PyList_Append(list, item);
2124 Py_DECREF(item);
2125 if (status < 0)
2126 goto error;
2127
2128 if (state.ptr == state.start)
2129 state.start = (void*) ((char*) state.ptr + state.charsize);
2130 else
2131 state.start = state.ptr;
2132
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002133 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002134
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002135 state_fini(&state);
2136 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002137
2138error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002139 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002140 state_fini(&state);
2141 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002142
Guido van Rossumb700df92000-03-31 14:59:30 +00002143}
2144
Fredrik Lundh703ce812001-10-24 22:16:30 +00002145#if PY_VERSION_HEX >= 0x02020000
2146static PyObject*
2147pattern_finditer(PatternObject* pattern, PyObject* args)
2148{
2149 PyObject* scanner;
2150 PyObject* search;
2151 PyObject* iterator;
2152
2153 scanner = pattern_scanner(pattern, args);
2154 if (!scanner)
2155 return NULL;
2156
2157 search = PyObject_GetAttrString(scanner, "search");
2158 Py_DECREF(scanner);
2159 if (!search)
2160 return NULL;
2161
2162 iterator = PyCallIter_New(search, Py_None);
2163 Py_DECREF(search);
2164
2165 return iterator;
2166}
2167#endif
2168
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002169static PyObject*
2170pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2171{
2172 SRE_STATE state;
2173 PyObject* list;
2174 PyObject* item;
2175 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002176 Py_ssize_t n;
2177 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002178 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002179
2180 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002181 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002182 static char* kwlist[] = { "source", "maxsplit", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002183 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002184 &string, &maxsplit))
2185 return NULL;
2186
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002187 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002188 if (!string)
2189 return NULL;
2190
2191 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002192 if (!list) {
2193 state_fini(&state);
2194 return NULL;
2195 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002196
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002197 n = 0;
2198 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002199
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002200 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002201
2202 state_reset(&state);
2203
2204 state.ptr = state.start;
2205
2206 if (state.charsize == 1) {
2207 status = sre_search(&state, PatternObject_GetCode(self));
2208 } else {
2209#if defined(HAVE_UNICODE)
2210 status = sre_usearch(&state, PatternObject_GetCode(self));
2211#endif
2212 }
2213
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002214 if (PyErr_Occurred())
2215 goto error;
2216
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002217 if (status <= 0) {
2218 if (status == 0)
2219 break;
2220 pattern_error(status);
2221 goto error;
2222 }
Tim Peters3d563502006-01-21 02:47:53 +00002223
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002224 if (state.start == state.ptr) {
2225 if (last == state.end)
2226 break;
2227 /* skip one character */
2228 state.start = (void*) ((char*) state.ptr + state.charsize);
2229 continue;
2230 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002231
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002232 /* get segment before this match */
2233 item = PySequence_GetSlice(
2234 string, STATE_OFFSET(&state, last),
2235 STATE_OFFSET(&state, state.start)
2236 );
2237 if (!item)
2238 goto error;
2239 status = PyList_Append(list, item);
2240 Py_DECREF(item);
2241 if (status < 0)
2242 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002243
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002244 /* add groups (if any) */
2245 for (i = 0; i < self->groups; i++) {
2246 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002247 if (!item)
2248 goto error;
2249 status = PyList_Append(list, item);
2250 Py_DECREF(item);
2251 if (status < 0)
2252 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002253 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002254
2255 n = n + 1;
2256
2257 last = state.start = state.ptr;
2258
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002259 }
2260
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002261 /* get segment following last match (even if empty) */
2262 item = PySequence_GetSlice(
2263 string, STATE_OFFSET(&state, last), state.endpos
2264 );
2265 if (!item)
2266 goto error;
2267 status = PyList_Append(list, item);
2268 Py_DECREF(item);
2269 if (status < 0)
2270 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002271
2272 state_fini(&state);
2273 return list;
2274
2275error:
2276 Py_DECREF(list);
2277 state_fini(&state);
2278 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002279
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002280}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002281
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002282static PyObject*
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002283pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002284 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002285{
2286 SRE_STATE state;
2287 PyObject* list;
2288 PyObject* item;
2289 PyObject* filter;
2290 PyObject* args;
2291 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002292 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002293 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002294 Py_ssize_t n;
2295 Py_ssize_t i, b, e;
2296 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002297 int filter_is_callable;
2298
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002299 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002300 /* sub/subn takes either a function or a template */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002301 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002302 Py_INCREF(filter);
2303 filter_is_callable = 1;
2304 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002305 /* if not callable, check if it's a literal string */
2306 int literal;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002307 ptr = getstring(ptemplate, &n, &bint);
2308 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002309 if (ptr) {
2310 if (b == 1) {
Skip Montanaro816a1622006-04-18 11:53:09 +00002311 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002312 } else {
2313#if defined(HAVE_UNICODE)
Skip Montanaro816a1622006-04-18 11:53:09 +00002314 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002315#endif
2316 }
2317 } else {
2318 PyErr_Clear();
2319 literal = 0;
2320 }
2321 if (literal) {
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002322 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002323 Py_INCREF(filter);
2324 filter_is_callable = 0;
2325 } else {
2326 /* not a literal; hand it over to the template compiler */
2327 filter = call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002328 SRE_PY_MODULE, "_subx",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002329 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002330 );
2331 if (!filter)
2332 return NULL;
2333 filter_is_callable = PyCallable_Check(filter);
2334 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002335 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002336
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002337 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002338 if (!string) {
2339 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002340 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002341 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002342
2343 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002344 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002345 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002346 state_fini(&state);
2347 return NULL;
2348 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002349
2350 n = i = 0;
2351
2352 while (!count || n < count) {
2353
2354 state_reset(&state);
2355
2356 state.ptr = state.start;
2357
2358 if (state.charsize == 1) {
2359 status = sre_search(&state, PatternObject_GetCode(self));
2360 } else {
2361#if defined(HAVE_UNICODE)
2362 status = sre_usearch(&state, PatternObject_GetCode(self));
2363#endif
2364 }
2365
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002366 if (PyErr_Occurred())
2367 goto error;
2368
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002369 if (status <= 0) {
2370 if (status == 0)
2371 break;
2372 pattern_error(status);
2373 goto error;
2374 }
Tim Peters3d563502006-01-21 02:47:53 +00002375
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002376 b = STATE_OFFSET(&state, state.start);
2377 e = STATE_OFFSET(&state, state.ptr);
2378
2379 if (i < b) {
2380 /* get segment before this match */
2381 item = PySequence_GetSlice(string, i, b);
2382 if (!item)
2383 goto error;
2384 status = PyList_Append(list, item);
2385 Py_DECREF(item);
2386 if (status < 0)
2387 goto error;
2388
2389 } else if (i == b && i == e && n > 0)
2390 /* ignore empty match on latest position */
2391 goto next;
2392
2393 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002394 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002395 match = pattern_new_match(self, &state, 1);
2396 if (!match)
2397 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002398 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002399 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002400 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002401 goto error;
2402 }
2403 item = PyObject_CallObject(filter, args);
2404 Py_DECREF(args);
2405 Py_DECREF(match);
2406 if (!item)
2407 goto error;
2408 } else {
2409 /* filter is literal string */
2410 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002411 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002412 }
2413
2414 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002415 if (item != Py_None) {
2416 status = PyList_Append(list, item);
2417 Py_DECREF(item);
2418 if (status < 0)
2419 goto error;
2420 }
Tim Peters3d563502006-01-21 02:47:53 +00002421
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002422 i = e;
2423 n = n + 1;
2424
2425next:
2426 /* move on */
2427 if (state.ptr == state.start)
2428 state.start = (void*) ((char*) state.ptr + state.charsize);
2429 else
2430 state.start = state.ptr;
2431
2432 }
2433
2434 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002435 if (i < state.endpos) {
2436 item = PySequence_GetSlice(string, i, state.endpos);
2437 if (!item)
2438 goto error;
2439 status = PyList_Append(list, item);
2440 Py_DECREF(item);
2441 if (status < 0)
2442 goto error;
2443 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002444
2445 state_fini(&state);
2446
Guido van Rossum4e173842001-12-07 04:25:10 +00002447 Py_DECREF(filter);
2448
Fredrik Lundhdac58492001-10-21 21:48:30 +00002449 /* convert list to single string (also removes list) */
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002450 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002451
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002452 if (!item)
2453 return NULL;
2454
2455 if (subn)
Antoine Pitroub83575b2012-12-02 12:52:36 +01002456 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002457
2458 return item;
2459
2460error:
2461 Py_DECREF(list);
2462 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002463 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002464 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002465
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466}
2467
2468static PyObject*
2469pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2470{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002471 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002472 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002473 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002474 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002475 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002476 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477 return NULL;
2478
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002479 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002480}
2481
2482static PyObject*
2483pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2484{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002485 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002486 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002487 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002488 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002489 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002490 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002491 return NULL;
2492
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002493 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002494}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002495
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002496static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00002497pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002498{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002499#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002500 PatternObject* copy;
2501 int offset;
2502
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002503 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2504 if (!copy)
2505 return NULL;
2506
2507 offset = offsetof(PatternObject, groups);
2508
2509 Py_XINCREF(self->groupindex);
2510 Py_XINCREF(self->indexgroup);
2511 Py_XINCREF(self->pattern);
2512
2513 memcpy((char*) copy + offset, (char*) self + offset,
2514 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002515 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002516
2517 return (PyObject*) copy;
2518#else
2519 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2520 return NULL;
2521#endif
2522}
2523
2524static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00002525pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002526{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002527#ifdef USE_BUILTIN_COPY
2528 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002529
Georg Brandlfbef5882006-05-28 22:14:04 +00002530 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002531 if (!copy)
2532 return NULL;
2533
2534 if (!deepcopy(&copy->groupindex, memo) ||
2535 !deepcopy(&copy->indexgroup, memo) ||
2536 !deepcopy(&copy->pattern, memo)) {
2537 Py_DECREF(copy);
2538 return NULL;
2539 }
2540
2541#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002542 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2543 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002544#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002545}
2546
Raymond Hettinger94478742004-09-24 04:31:19 +00002547PyDoc_STRVAR(pattern_match_doc,
2548"match(string[, pos[, endpos]]) --> match object or None.\n\
2549 Matches zero or more characters at the beginning of the string");
2550
2551PyDoc_STRVAR(pattern_search_doc,
2552"search(string[, pos[, endpos]]) --> match object or None.\n\
2553 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlovc08ded92012-12-25 18:50:03 +02002554 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002555
2556PyDoc_STRVAR(pattern_split_doc,
2557"split(string[, maxsplit = 0]) --> list.\n\
2558 Split string by the occurrences of pattern.");
2559
2560PyDoc_STRVAR(pattern_findall_doc,
2561"findall(string[, pos[, endpos]]) --> list.\n\
2562 Return a list of all non-overlapping matches of pattern in string.");
2563
2564PyDoc_STRVAR(pattern_finditer_doc,
2565"finditer(string[, pos[, endpos]]) --> iterator.\n\
2566 Return an iterator over all non-overlapping matches for the \n\
2567 RE pattern in string. For each match, the iterator returns a\n\
2568 match object.");
2569
2570PyDoc_STRVAR(pattern_sub_doc,
2571"sub(repl, string[, count = 0]) --> newstring\n\
2572 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002573 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002574
2575PyDoc_STRVAR(pattern_subn_doc,
2576"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2577 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2578 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002579 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002580
2581PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2582
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002583static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002584 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002585 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002586 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002587 pattern_search_doc},
2588 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2589 pattern_sub_doc},
2590 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2591 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002592 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002593 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002594 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002595 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002596#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002597 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2598 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002599#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002600 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00002601 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2602 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002603 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002604};
2605
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002606#define PAT_OFF(x) offsetof(PatternObject, x)
2607static PyMemberDef pattern_members[] = {
2608 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2609 {"flags", T_INT, PAT_OFF(flags), READONLY},
2610 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2611 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2612 {NULL} /* Sentinel */
2613};
Guido van Rossumb700df92000-03-31 14:59:30 +00002614
2615statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002616 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002617 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002618 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002619 (destructor)pattern_dealloc, /*tp_dealloc*/
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002620 0, /* tp_print */
2621 0, /* tp_getattrn */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002622 0, /* tp_setattr */
2623 0, /* tp_compare */
2624 0, /* tp_repr */
2625 0, /* tp_as_number */
2626 0, /* tp_as_sequence */
2627 0, /* tp_as_mapping */
2628 0, /* tp_hash */
2629 0, /* tp_call */
2630 0, /* tp_str */
2631 0, /* tp_getattro */
2632 0, /* tp_setattro */
2633 0, /* tp_as_buffer */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002634 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002635 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002636 0, /* tp_traverse */
2637 0, /* tp_clear */
2638 0, /* tp_richcompare */
2639 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002640 0, /* tp_iter */
2641 0, /* tp_iternext */
2642 pattern_methods, /* tp_methods */
2643 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002644};
2645
Guido van Rossum8b762f02008-08-05 03:39:21 +00002646static int _validate(PatternObject *self); /* Forward */
2647
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002648static PyObject *
2649_compile(PyObject* self_, PyObject* args)
2650{
2651 /* "compile" pattern descriptor to pattern object */
2652
2653 PatternObject* self;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002654 Py_ssize_t i, n;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002655
2656 PyObject* pattern;
2657 int flags = 0;
2658 PyObject* code;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002659 Py_ssize_t groups = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002660 PyObject* groupindex = NULL;
2661 PyObject* indexgroup = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002662 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002663 &PyList_Type, &code, &groups,
2664 &groupindex, &indexgroup))
2665 return NULL;
2666
2667 n = PyList_GET_SIZE(code);
Christian Heimes4956d2b2008-01-18 19:12:56 +00002668 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002669 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2670 if (!self)
2671 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002672 self->weakreflist = NULL;
2673 self->pattern = NULL;
2674 self->groupindex = NULL;
2675 self->indexgroup = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002676
2677 self->codesize = n;
2678
2679 for (i = 0; i < n; i++) {
2680 PyObject *o = PyList_GET_ITEM(code, i);
2681 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2682 : PyLong_AsUnsignedLong(o);
Antoine Pitroub83ea142012-11-20 22:30:42 +01002683 if (value == (unsigned long)-1 && PyErr_Occurred()) {
2684 if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2685 PyErr_SetString(PyExc_OverflowError,
2686 "regular expression code size limit exceeded");
2687 }
2688 break;
2689 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002690 self->code[i] = (SRE_CODE) value;
2691 if ((unsigned long) self->code[i] != value) {
2692 PyErr_SetString(PyExc_OverflowError,
2693 "regular expression code size limit exceeded");
2694 break;
2695 }
2696 }
2697
2698 if (PyErr_Occurred()) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002699 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002700 return NULL;
2701 }
2702
2703 Py_INCREF(pattern);
2704 self->pattern = pattern;
2705
2706 self->flags = flags;
2707
2708 self->groups = groups;
2709
2710 Py_XINCREF(groupindex);
2711 self->groupindex = groupindex;
2712
2713 Py_XINCREF(indexgroup);
2714 self->indexgroup = indexgroup;
2715
2716 self->weakreflist = NULL;
2717
Guido van Rossum8b762f02008-08-05 03:39:21 +00002718 if (!_validate(self)) {
2719 Py_DECREF(self);
2720 return NULL;
2721 }
2722
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002723 return (PyObject*) self;
2724}
2725
Guido van Rossumb700df92000-03-31 14:59:30 +00002726/* -------------------------------------------------------------------- */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002727/* Code validation */
2728
2729/* To learn more about this code, have a look at the _compile() function in
2730 Lib/sre_compile.py. The validation functions below checks the code array
2731 for conformance with the code patterns generated there.
2732
2733 The nice thing about the generated code is that it is position-independent:
2734 all jumps are relative jumps forward. Also, jumps don't cross each other:
2735 the target of a later jump is always earlier than the target of an earlier
2736 jump. IOW, this is okay:
2737
2738 J---------J-------T--------T
2739 \ \_____/ /
2740 \______________________/
2741
2742 but this is not:
2743
2744 J---------J-------T--------T
2745 \_________\_____/ /
2746 \____________/
2747
2748 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2749 bytes wide (the latter if Python is compiled for "wide" unicode support).
2750*/
2751
2752/* Defining this one enables tracing of the validator */
2753#undef VVERBOSE
2754
2755/* Trace macro for the validator */
2756#if defined(VVERBOSE)
2757#define VTRACE(v) printf v
2758#else
Senthil Kumarand5830682011-10-20 02:13:23 +08002759#define VTRACE(v) do {} while(0) /* do nothing */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002760#endif
2761
2762/* Report failure */
2763#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2764
2765/* Extract opcode, argument, or skip count from code array */
2766#define GET_OP \
2767 do { \
2768 VTRACE(("%p: ", code)); \
2769 if (code >= end) FAIL; \
2770 op = *code++; \
2771 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2772 } while (0)
2773#define GET_ARG \
2774 do { \
2775 VTRACE(("%p= ", code)); \
2776 if (code >= end) FAIL; \
2777 arg = *code++; \
2778 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2779 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002780#define GET_SKIP_ADJ(adj) \
Guido van Rossum8b762f02008-08-05 03:39:21 +00002781 do { \
2782 VTRACE(("%p= ", code)); \
2783 if (code >= end) FAIL; \
2784 skip = *code; \
2785 VTRACE(("%lu (skip to %p)\n", \
2786 (unsigned long)skip, code+skip)); \
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002787 if (code+skip-adj < code || code+skip-adj > end)\
Guido van Rossum8b762f02008-08-05 03:39:21 +00002788 FAIL; \
2789 code++; \
2790 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002791#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum8b762f02008-08-05 03:39:21 +00002792
2793static int
2794_validate_charset(SRE_CODE *code, SRE_CODE *end)
2795{
2796 /* Some variables are manipulated by the macros above */
2797 SRE_CODE op;
2798 SRE_CODE arg;
2799 SRE_CODE offset;
2800 int i;
2801
2802 while (code < end) {
2803 GET_OP;
2804 switch (op) {
2805
2806 case SRE_OP_NEGATE:
2807 break;
2808
2809 case SRE_OP_LITERAL:
2810 GET_ARG;
2811 break;
2812
2813 case SRE_OP_RANGE:
2814 GET_ARG;
2815 GET_ARG;
2816 break;
2817
2818 case SRE_OP_CHARSET:
2819 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2820 if (code+offset < code || code+offset > end)
2821 FAIL;
2822 code += offset;
2823 break;
2824
2825 case SRE_OP_BIGCHARSET:
2826 GET_ARG; /* Number of blocks */
2827 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2828 if (code+offset < code || code+offset > end)
2829 FAIL;
2830 /* Make sure that each byte points to a valid block */
2831 for (i = 0; i < 256; i++) {
2832 if (((unsigned char *)code)[i] >= arg)
2833 FAIL;
2834 }
2835 code += offset;
2836 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2837 if (code+offset < code || code+offset > end)
2838 FAIL;
2839 code += offset;
2840 break;
2841
2842 case SRE_OP_CATEGORY:
2843 GET_ARG;
2844 switch (arg) {
2845 case SRE_CATEGORY_DIGIT:
2846 case SRE_CATEGORY_NOT_DIGIT:
2847 case SRE_CATEGORY_SPACE:
2848 case SRE_CATEGORY_NOT_SPACE:
2849 case SRE_CATEGORY_WORD:
2850 case SRE_CATEGORY_NOT_WORD:
2851 case SRE_CATEGORY_LINEBREAK:
2852 case SRE_CATEGORY_NOT_LINEBREAK:
2853 case SRE_CATEGORY_LOC_WORD:
2854 case SRE_CATEGORY_LOC_NOT_WORD:
2855 case SRE_CATEGORY_UNI_DIGIT:
2856 case SRE_CATEGORY_UNI_NOT_DIGIT:
2857 case SRE_CATEGORY_UNI_SPACE:
2858 case SRE_CATEGORY_UNI_NOT_SPACE:
2859 case SRE_CATEGORY_UNI_WORD:
2860 case SRE_CATEGORY_UNI_NOT_WORD:
2861 case SRE_CATEGORY_UNI_LINEBREAK:
2862 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2863 break;
2864 default:
2865 FAIL;
2866 }
2867 break;
2868
2869 default:
2870 FAIL;
2871
2872 }
2873 }
2874
2875 return 1;
2876}
2877
2878static int
2879_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2880{
2881 /* Some variables are manipulated by the macros above */
2882 SRE_CODE op;
2883 SRE_CODE arg;
2884 SRE_CODE skip;
2885
2886 VTRACE(("code=%p, end=%p\n", code, end));
2887
2888 if (code > end)
2889 FAIL;
2890
2891 while (code < end) {
2892 GET_OP;
2893 switch (op) {
2894
2895 case SRE_OP_MARK:
2896 /* We don't check whether marks are properly nested; the
2897 sre_match() code is robust even if they don't, and the worst
2898 you can get is nonsensical match results. */
2899 GET_ARG;
2900 if (arg > 2*groups+1) {
2901 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2902 FAIL;
2903 }
2904 break;
2905
2906 case SRE_OP_LITERAL:
2907 case SRE_OP_NOT_LITERAL:
2908 case SRE_OP_LITERAL_IGNORE:
2909 case SRE_OP_NOT_LITERAL_IGNORE:
2910 GET_ARG;
2911 /* The arg is just a character, nothing to check */
2912 break;
2913
2914 case SRE_OP_SUCCESS:
2915 case SRE_OP_FAILURE:
2916 /* Nothing to check; these normally end the matching process */
2917 break;
2918
2919 case SRE_OP_AT:
2920 GET_ARG;
2921 switch (arg) {
2922 case SRE_AT_BEGINNING:
2923 case SRE_AT_BEGINNING_STRING:
2924 case SRE_AT_BEGINNING_LINE:
2925 case SRE_AT_END:
2926 case SRE_AT_END_LINE:
2927 case SRE_AT_END_STRING:
2928 case SRE_AT_BOUNDARY:
2929 case SRE_AT_NON_BOUNDARY:
2930 case SRE_AT_LOC_BOUNDARY:
2931 case SRE_AT_LOC_NON_BOUNDARY:
2932 case SRE_AT_UNI_BOUNDARY:
2933 case SRE_AT_UNI_NON_BOUNDARY:
2934 break;
2935 default:
2936 FAIL;
2937 }
2938 break;
2939
2940 case SRE_OP_ANY:
2941 case SRE_OP_ANY_ALL:
2942 /* These have no operands */
2943 break;
2944
2945 case SRE_OP_IN:
2946 case SRE_OP_IN_IGNORE:
2947 GET_SKIP;
2948 /* Stop 1 before the end; we check the FAILURE below */
2949 if (!_validate_charset(code, code+skip-2))
2950 FAIL;
2951 if (code[skip-2] != SRE_OP_FAILURE)
2952 FAIL;
2953 code += skip-1;
2954 break;
2955
2956 case SRE_OP_INFO:
2957 {
2958 /* A minimal info field is
2959 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2960 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2961 more follows. */
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002962 SRE_CODE flags, i;
Guido van Rossum8b762f02008-08-05 03:39:21 +00002963 SRE_CODE *newcode;
2964 GET_SKIP;
2965 newcode = code+skip-1;
2966 GET_ARG; flags = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002967 GET_ARG; /* min */
2968 GET_ARG; /* max */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002969 /* Check that only valid flags are present */
2970 if ((flags & ~(SRE_INFO_PREFIX |
2971 SRE_INFO_LITERAL |
2972 SRE_INFO_CHARSET)) != 0)
2973 FAIL;
2974 /* PREFIX and CHARSET are mutually exclusive */
2975 if ((flags & SRE_INFO_PREFIX) &&
2976 (flags & SRE_INFO_CHARSET))
2977 FAIL;
2978 /* LITERAL implies PREFIX */
2979 if ((flags & SRE_INFO_LITERAL) &&
2980 !(flags & SRE_INFO_PREFIX))
2981 FAIL;
2982 /* Validate the prefix */
2983 if (flags & SRE_INFO_PREFIX) {
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002984 SRE_CODE prefix_len;
Guido van Rossum8b762f02008-08-05 03:39:21 +00002985 GET_ARG; prefix_len = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00002986 GET_ARG; /* prefix skip */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002987 /* Here comes the prefix string */
2988 if (code+prefix_len < code || code+prefix_len > newcode)
2989 FAIL;
2990 code += prefix_len;
2991 /* And here comes the overlap table */
2992 if (code+prefix_len < code || code+prefix_len > newcode)
2993 FAIL;
2994 /* Each overlap value should be < prefix_len */
2995 for (i = 0; i < prefix_len; i++) {
2996 if (code[i] >= prefix_len)
2997 FAIL;
2998 }
2999 code += prefix_len;
3000 }
3001 /* Validate the charset */
3002 if (flags & SRE_INFO_CHARSET) {
3003 if (!_validate_charset(code, newcode-1))
3004 FAIL;
3005 if (newcode[-1] != SRE_OP_FAILURE)
3006 FAIL;
3007 code = newcode;
3008 }
3009 else if (code != newcode) {
3010 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3011 FAIL;
3012 }
3013 }
3014 break;
3015
3016 case SRE_OP_BRANCH:
3017 {
3018 SRE_CODE *target = NULL;
3019 for (;;) {
3020 GET_SKIP;
3021 if (skip == 0)
3022 break;
3023 /* Stop 2 before the end; we check the JUMP below */
3024 if (!_validate_inner(code, code+skip-3, groups))
3025 FAIL;
3026 code += skip-3;
3027 /* Check that it ends with a JUMP, and that each JUMP
3028 has the same target */
3029 GET_OP;
3030 if (op != SRE_OP_JUMP)
3031 FAIL;
3032 GET_SKIP;
3033 if (target == NULL)
3034 target = code+skip-1;
3035 else if (code+skip-1 != target)
3036 FAIL;
3037 }
3038 }
3039 break;
3040
3041 case SRE_OP_REPEAT_ONE:
3042 case SRE_OP_MIN_REPEAT_ONE:
3043 {
3044 SRE_CODE min, max;
3045 GET_SKIP;
3046 GET_ARG; min = arg;
3047 GET_ARG; max = arg;
3048 if (min > max)
3049 FAIL;
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02003050 if (max > SRE_MAXREPEAT)
Guido van Rossum8b762f02008-08-05 03:39:21 +00003051 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003052 if (!_validate_inner(code, code+skip-4, groups))
3053 FAIL;
3054 code += skip-4;
3055 GET_OP;
3056 if (op != SRE_OP_SUCCESS)
3057 FAIL;
3058 }
3059 break;
3060
3061 case SRE_OP_REPEAT:
3062 {
3063 SRE_CODE min, max;
3064 GET_SKIP;
3065 GET_ARG; min = arg;
3066 GET_ARG; max = arg;
3067 if (min > max)
3068 FAIL;
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02003069 if (max > SRE_MAXREPEAT)
Guido van Rossum8b762f02008-08-05 03:39:21 +00003070 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003071 if (!_validate_inner(code, code+skip-3, groups))
3072 FAIL;
3073 code += skip-3;
3074 GET_OP;
3075 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3076 FAIL;
3077 }
3078 break;
3079
3080 case SRE_OP_GROUPREF:
3081 case SRE_OP_GROUPREF_IGNORE:
3082 GET_ARG;
3083 if (arg >= groups)
3084 FAIL;
3085 break;
3086
3087 case SRE_OP_GROUPREF_EXISTS:
3088 /* The regex syntax for this is: '(?(group)then|else)', where
3089 'group' is either an integer group number or a group name,
3090 'then' and 'else' are sub-regexes, and 'else' is optional. */
3091 GET_ARG;
3092 if (arg >= groups)
3093 FAIL;
Guido van Rossume3c4fd92008-09-10 14:27:00 +00003094 GET_SKIP_ADJ(1);
Guido van Rossum8b762f02008-08-05 03:39:21 +00003095 code--; /* The skip is relative to the first arg! */
3096 /* There are two possibilities here: if there is both a 'then'
3097 part and an 'else' part, the generated code looks like:
3098
3099 GROUPREF_EXISTS
3100 <group>
3101 <skipyes>
3102 ...then part...
3103 JUMP
3104 <skipno>
3105 (<skipyes> jumps here)
3106 ...else part...
3107 (<skipno> jumps here)
3108
3109 If there is only a 'then' part, it looks like:
3110
3111 GROUPREF_EXISTS
3112 <group>
3113 <skip>
3114 ...then part...
3115 (<skip> jumps here)
3116
3117 There is no direct way to decide which it is, and we don't want
3118 to allow arbitrary jumps anywhere in the code; so we just look
3119 for a JUMP opcode preceding our skip target.
3120 */
3121 if (skip >= 3 && code+skip-3 >= code &&
3122 code[skip-3] == SRE_OP_JUMP)
3123 {
3124 VTRACE(("both then and else parts present\n"));
3125 if (!_validate_inner(code+1, code+skip-3, groups))
3126 FAIL;
3127 code += skip-2; /* Position after JUMP, at <skipno> */
3128 GET_SKIP;
3129 if (!_validate_inner(code, code+skip-1, groups))
3130 FAIL;
3131 code += skip-1;
3132 }
3133 else {
3134 VTRACE(("only a then part present\n"));
3135 if (!_validate_inner(code+1, code+skip-1, groups))
3136 FAIL;
3137 code += skip-1;
3138 }
3139 break;
3140
3141 case SRE_OP_ASSERT:
3142 case SRE_OP_ASSERT_NOT:
3143 GET_SKIP;
3144 GET_ARG; /* 0 for lookahead, width for lookbehind */
3145 code--; /* Back up over arg to simplify math below */
3146 if (arg & 0x80000000)
3147 FAIL; /* Width too large */
3148 /* Stop 1 before the end; we check the SUCCESS below */
3149 if (!_validate_inner(code+1, code+skip-2, groups))
3150 FAIL;
3151 code += skip-2;
3152 GET_OP;
3153 if (op != SRE_OP_SUCCESS)
3154 FAIL;
3155 break;
3156
3157 default:
3158 FAIL;
3159
3160 }
3161 }
3162
3163 VTRACE(("okay\n"));
3164 return 1;
3165}
3166
3167static int
3168_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3169{
3170 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3171 FAIL;
3172 if (groups == 0) /* fix for simplejson */
3173 groups = 100; /* 100 groups should always be safe */
3174 return _validate_inner(code, end-1, groups);
3175}
3176
3177static int
3178_validate(PatternObject *self)
3179{
3180 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3181 {
3182 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3183 return 0;
3184 }
3185 else
3186 VTRACE(("Success!\n"));
3187 return 1;
3188}
3189
3190/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003191/* match methods */
3192
3193static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003194match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003195{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003196 Py_XDECREF(self->regs);
3197 Py_XDECREF(self->string);
3198 Py_DECREF(self->pattern);
3199 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003200}
3201
3202static PyObject*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003203match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003204{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003205 if (index < 0 || index >= self->groups) {
3206 /* raise IndexError if we were given a bad group number */
3207 PyErr_SetString(
3208 PyExc_IndexError,
3209 "no such group"
3210 );
3211 return NULL;
3212 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003213
Fredrik Lundh6f013982000-07-03 18:44:21 +00003214 index *= 2;
3215
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003216 if (self->string == Py_None || self->mark[index] < 0) {
3217 /* return default value if the string or group is undefined */
3218 Py_INCREF(def);
3219 return def;
3220 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003221
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003222 return PySequence_GetSlice(
3223 self->string, self->mark[index], self->mark[index+1]
3224 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003225}
3226
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003227static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003228match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003229{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003230 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003231
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003232 if (PyInt_Check(index))
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003233 return PyInt_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003234
Fredrik Lundh6f013982000-07-03 18:44:21 +00003235 i = -1;
3236
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003237 if (self->pattern->groupindex) {
3238 index = PyObject_GetItem(self->pattern->groupindex, index);
3239 if (index) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003240 if (PyInt_Check(index) || PyLong_Check(index))
3241 i = PyInt_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003242 Py_DECREF(index);
3243 } else
3244 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003245 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003246
3247 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003248}
3249
3250static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003251match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003252{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003253 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003254}
3255
3256static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003257match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003258{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003259 /* delegate to Python code */
3260 return call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00003261 SRE_PY_MODULE, "_expand",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003262 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003263 );
3264}
3265
3266static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003267match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003268{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003269 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003270 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003271
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003272 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003273
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003274 switch (size) {
3275 case 0:
3276 result = match_getslice(self, Py_False, Py_None);
3277 break;
3278 case 1:
3279 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3280 break;
3281 default:
3282 /* fetch multiple items */
3283 result = PyTuple_New(size);
3284 if (!result)
3285 return NULL;
3286 for (i = 0; i < size; i++) {
3287 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003288 self, PyTuple_GET_ITEM(args, i), Py_None
3289 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003290 if (!item) {
3291 Py_DECREF(result);
3292 return NULL;
3293 }
3294 PyTuple_SET_ITEM(result, i, item);
3295 }
3296 break;
3297 }
3298 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003299}
3300
3301static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003302match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003303{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003304 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003305 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003306
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003307 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003308 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003309 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003310 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003311
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003312 result = PyTuple_New(self->groups-1);
3313 if (!result)
3314 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003315
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003316 for (index = 1; index < self->groups; index++) {
3317 PyObject* item;
3318 item = match_getslice_by_index(self, index, def);
3319 if (!item) {
3320 Py_DECREF(result);
3321 return NULL;
3322 }
3323 PyTuple_SET_ITEM(result, index-1, item);
3324 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003325
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003326 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003327}
3328
3329static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003330match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003331{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003332 PyObject* result;
3333 PyObject* keys;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003334 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003335
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003336 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003337 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003338 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003339 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003340
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003341 result = PyDict_New();
3342 if (!result || !self->pattern->groupindex)
3343 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003345 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003346 if (!keys)
3347 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003349 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003350 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003351 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003352 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003353 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003354 if (!key)
3355 goto failed;
3356 value = match_getslice(self, key, def);
3357 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003358 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003359 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003360 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003361 status = PyDict_SetItem(result, key, value);
3362 Py_DECREF(value);
3363 if (status < 0)
3364 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003365 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003366
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003367 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003368
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003369 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003370
3371failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003372 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003373 Py_DECREF(result);
3374 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003375}
3376
3377static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003378match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003379{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003380 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003381
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003382 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003383 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003384 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003385
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003386 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003387
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003388 if (index < 0 || index >= self->groups) {
3389 PyErr_SetString(
3390 PyExc_IndexError,
3391 "no such group"
3392 );
3393 return NULL;
3394 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003395
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003396 /* mark is -1 if group is undefined */
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003397 return PyInt_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003398}
3399
3400static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003401match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003402{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003403 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003404
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003405 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003406 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003407 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003408
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003409 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003410
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003411 if (index < 0 || index >= self->groups) {
3412 PyErr_SetString(
3413 PyExc_IndexError,
3414 "no such group"
3415 );
3416 return NULL;
3417 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003418
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003419 /* mark is -1 if group is undefined */
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003420 return PyInt_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003421}
3422
3423LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003424_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003425{
3426 PyObject* pair;
3427 PyObject* item;
3428
3429 pair = PyTuple_New(2);
3430 if (!pair)
3431 return NULL;
3432
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003433 item = PyInt_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003434 if (!item)
3435 goto error;
3436 PyTuple_SET_ITEM(pair, 0, item);
3437
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003438 item = PyInt_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003439 if (!item)
3440 goto error;
3441 PyTuple_SET_ITEM(pair, 1, item);
3442
3443 return pair;
3444
3445 error:
3446 Py_DECREF(pair);
3447 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003448}
3449
3450static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003451match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003452{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003453 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003454
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003455 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003456 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003457 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003458
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003459 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003460
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003461 if (index < 0 || index >= self->groups) {
3462 PyErr_SetString(
3463 PyExc_IndexError,
3464 "no such group"
3465 );
3466 return NULL;
3467 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003468
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003469 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003470 return _pair(self->mark[index*2], self->mark[index*2+1]);
3471}
3472
3473static PyObject*
3474match_regs(MatchObject* self)
3475{
3476 PyObject* regs;
3477 PyObject* item;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003478 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003479
3480 regs = PyTuple_New(self->groups);
3481 if (!regs)
3482 return NULL;
3483
3484 for (index = 0; index < self->groups; index++) {
3485 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3486 if (!item) {
3487 Py_DECREF(regs);
3488 return NULL;
3489 }
3490 PyTuple_SET_ITEM(regs, index, item);
3491 }
3492
3493 Py_INCREF(regs);
3494 self->regs = regs;
3495
3496 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003497}
3498
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003499static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003500match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003501{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003502#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003503 MatchObject* copy;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003504 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003505
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003506 slots = 2 * (self->pattern->groups+1);
3507
3508 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3509 if (!copy)
3510 return NULL;
3511
3512 /* this value a constant, but any compiler should be able to
3513 figure that out all by itself */
3514 offset = offsetof(MatchObject, string);
3515
3516 Py_XINCREF(self->pattern);
3517 Py_XINCREF(self->string);
3518 Py_XINCREF(self->regs);
3519
3520 memcpy((char*) copy + offset, (char*) self + offset,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003521 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003522
3523 return (PyObject*) copy;
3524#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003525 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003526 return NULL;
3527#endif
3528}
3529
3530static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003531match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003532{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003533#ifdef USE_BUILTIN_COPY
3534 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003535
Georg Brandlfbef5882006-05-28 22:14:04 +00003536 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003537 if (!copy)
3538 return NULL;
3539
3540 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3541 !deepcopy(&copy->string, memo) ||
3542 !deepcopy(&copy->regs, memo)) {
3543 Py_DECREF(copy);
3544 return NULL;
3545 }
3546
3547#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003548 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3549 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003550#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003551}
3552
Andrew Svetlov1c6c90f2012-12-23 20:09:01 +02003553PyDoc_STRVAR(match_doc,
3554"The result of re.match() and re.search().\n\
3555Match objects always have a boolean value of True.");
3556
3557PyDoc_STRVAR(match_group_doc,
3558"group([group1, ...]) -> str or tuple.\n\
3559 Return subgroup(s) of the match by indices or names.\n\
3560 For 0 returns the entire match.");
3561
3562PyDoc_STRVAR(match_start_doc,
3563"start([group=0]) -> int.\n\
3564 Return index of the start of the substring matched by group.");
3565
3566PyDoc_STRVAR(match_end_doc,
3567"end([group=0]) -> int.\n\
3568 Return index of the end of the substring matched by group.");
3569
3570PyDoc_STRVAR(match_span_doc,
3571"span([group]) -> tuple.\n\
3572 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3573
3574PyDoc_STRVAR(match_groups_doc,
3575"groups([default=None]) -> tuple.\n\
3576 Return a tuple containing all the subgroups of the match, from 1.\n\
3577 The default argument is used for groups\n\
3578 that did not participate in the match");
3579
3580PyDoc_STRVAR(match_groupdict_doc,
3581"groupdict([default=None]) -> dict.\n\
3582 Return a dictionary containing all the named subgroups of the match,\n\
3583 keyed by the subgroup name. The default argument is used for groups\n\
3584 that did not participate in the match");
3585
3586PyDoc_STRVAR(match_expand_doc,
3587"expand(template) -> str.\n\
3588 Return the string obtained by doing backslash substitution\n\
3589 on the string template, as done by the sub() method.");
3590
3591static PyMethodDef match_methods[] = {
3592 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3593 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3594 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3595 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3596 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3597 match_groups_doc},
3598 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3599 match_groupdict_doc},
3600 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Georg Brandlfbef5882006-05-28 22:14:04 +00003601 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3602 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003603 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003604};
3605
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003606static PyObject *
3607match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003608{
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003609 if (self->lastindex >= 0)
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003610 return PyInt_FromSsize_t(self->lastindex);
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003611 Py_INCREF(Py_None);
3612 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003613}
3614
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003615static PyObject *
3616match_lastgroup_get(MatchObject *self)
3617{
3618 if (self->pattern->indexgroup && self->lastindex >= 0) {
3619 PyObject* result = PySequence_GetItem(
3620 self->pattern->indexgroup, self->lastindex
3621 );
3622 if (result)
3623 return result;
3624 PyErr_Clear();
3625 }
3626 Py_INCREF(Py_None);
3627 return Py_None;
3628}
3629
3630static PyObject *
3631match_regs_get(MatchObject *self)
3632{
3633 if (self->regs) {
3634 Py_INCREF(self->regs);
3635 return self->regs;
3636 } else
3637 return match_regs(self);
3638}
3639
3640static PyGetSetDef match_getset[] = {
3641 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3642 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3643 {"regs", (getter)match_regs_get, (setter)NULL},
3644 {NULL}
3645};
3646
3647#define MATCH_OFF(x) offsetof(MatchObject, x)
3648static PyMemberDef match_members[] = {
3649 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3650 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3651 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3652 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3653 {NULL}
3654};
3655
3656
Guido van Rossumb700df92000-03-31 14:59:30 +00003657/* FIXME: implement setattr("string", None) as a special case (to
3658 detach the associated string, if any */
3659
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003660static PyTypeObject Match_Type = {
3661 PyVarObject_HEAD_INIT(NULL, 0)
3662 "_" SRE_MODULE ".SRE_Match",
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003663 sizeof(MatchObject), sizeof(Py_ssize_t),
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003664 (destructor)match_dealloc, /* tp_dealloc */
3665 0, /* tp_print */
3666 0, /* tp_getattr */
3667 0, /* tp_setattr */
3668 0, /* tp_compare */
3669 0, /* tp_repr */
3670 0, /* tp_as_number */
3671 0, /* tp_as_sequence */
3672 0, /* tp_as_mapping */
3673 0, /* tp_hash */
3674 0, /* tp_call */
3675 0, /* tp_str */
3676 0, /* tp_getattro */
3677 0, /* tp_setattro */
3678 0, /* tp_as_buffer */
3679 Py_TPFLAGS_DEFAULT,
Andrew Svetlov1c6c90f2012-12-23 20:09:01 +02003680 match_doc, /* tp_doc */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003681 0, /* tp_traverse */
3682 0, /* tp_clear */
3683 0, /* tp_richcompare */
3684 0, /* tp_weaklistoffset */
3685 0, /* tp_iter */
3686 0, /* tp_iternext */
3687 match_methods, /* tp_methods */
3688 match_members, /* tp_members */
3689 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003690};
3691
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003692static PyObject*
3693pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3694{
3695 /* create match object (from state object) */
3696
3697 MatchObject* match;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003698 Py_ssize_t i, j;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003699 char* base;
3700 int n;
3701
3702 if (status > 0) {
3703
3704 /* create match object (with room for extra group marks) */
Christian Heimes4956d2b2008-01-18 19:12:56 +00003705 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003706 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3707 2*(pattern->groups+1));
3708 if (!match)
3709 return NULL;
3710
3711 Py_INCREF(pattern);
3712 match->pattern = pattern;
3713
3714 Py_INCREF(state->string);
3715 match->string = state->string;
3716
3717 match->regs = NULL;
3718 match->groups = pattern->groups+1;
3719
3720 /* fill in group slices */
3721
3722 base = (char*) state->beginning;
3723 n = state->charsize;
3724
3725 match->mark[0] = ((char*) state->start - base) / n;
3726 match->mark[1] = ((char*) state->ptr - base) / n;
3727
3728 for (i = j = 0; i < pattern->groups; i++, j+=2)
3729 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3730 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3731 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3732 } else
3733 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3734
3735 match->pos = state->pos;
3736 match->endpos = state->endpos;
3737
3738 match->lastindex = state->lastindex;
3739
3740 return (PyObject*) match;
3741
3742 } else if (status == 0) {
3743
3744 /* no match */
3745 Py_INCREF(Py_None);
3746 return Py_None;
3747
3748 }
3749
3750 /* internal error */
3751 pattern_error(status);
3752 return NULL;
3753}
3754
3755
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003756/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003757/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003758
3759static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003760scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003761{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003762 state_fini(&self->state);
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003763 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003764 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003765}
3766
3767static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003768scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003769{
3770 SRE_STATE* state = &self->state;
3771 PyObject* match;
3772 int status;
3773
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003774 state_reset(state);
3775
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003776 state->ptr = state->start;
3777
3778 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003779 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003780 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003781#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003782 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003783#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003784 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003785 if (PyErr_Occurred())
3786 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003787
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003788 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003789 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003790
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003791 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003792 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003793 else
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003794 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003795
3796 return match;
3797}
3798
3799
3800static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003801scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003802{
3803 SRE_STATE* state = &self->state;
3804 PyObject* match;
3805 int status;
3806
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003807 state_reset(state);
3808
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003809 state->ptr = state->start;
3810
3811 if (state->charsize == 1) {
3812 status = sre_search(state, PatternObject_GetCode(self->pattern));
3813 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003814#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003815 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003816#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003817 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003818 if (PyErr_Occurred())
3819 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003820
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003821 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003822 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003823
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003824 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003825 state->start = (void*) ((char*) state->ptr + state->charsize);
3826 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003827 state->start = state->ptr;
3828
3829 return match;
3830}
3831
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003832static PyMethodDef scanner_methods[] = {
Georg Brandlfbef5882006-05-28 22:14:04 +00003833 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3834 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003835 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003836};
3837
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003838#define SCAN_OFF(x) offsetof(ScannerObject, x)
3839static PyMemberDef scanner_members[] = {
3840 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3841 {NULL} /* Sentinel */
3842};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003843
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003844statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003845 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003846 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003847 sizeof(ScannerObject), 0,
3848 (destructor)scanner_dealloc, /*tp_dealloc*/
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003849 0, /* tp_print */
3850 0, /* tp_getattr */
3851 0, /* tp_setattr */
3852 0, /* tp_reserved */
3853 0, /* tp_repr */
3854 0, /* tp_as_number */
3855 0, /* tp_as_sequence */
3856 0, /* tp_as_mapping */
3857 0, /* tp_hash */
3858 0, /* tp_call */
3859 0, /* tp_str */
3860 0, /* tp_getattro */
3861 0, /* tp_setattro */
3862 0, /* tp_as_buffer */
3863 Py_TPFLAGS_DEFAULT, /* tp_flags */
3864 0, /* tp_doc */
3865 0, /* tp_traverse */
3866 0, /* tp_clear */
3867 0, /* tp_richcompare */
3868 0, /* tp_weaklistoffset */
3869 0, /* tp_iter */
3870 0, /* tp_iternext */
3871 scanner_methods, /* tp_methods */
3872 scanner_members, /* tp_members */
3873 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003874};
3875
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003876static PyObject*
3877pattern_scanner(PatternObject* pattern, PyObject* args)
3878{
3879 /* create search state object */
3880
3881 ScannerObject* self;
3882
3883 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003884 Py_ssize_t start = 0;
3885 Py_ssize_t end = PY_SSIZE_T_MAX;
3886 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003887 return NULL;
3888
3889 /* create scanner object */
3890 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3891 if (!self)
3892 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003893 self->pattern = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003894
3895 string = state_init(&self->state, pattern, string, start, end);
3896 if (!string) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003897 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003898 return NULL;
3899 }
3900
3901 Py_INCREF(pattern);
3902 self->pattern = (PyObject*) pattern;
3903
3904 return (PyObject*) self;
3905}
3906
Guido van Rossumb700df92000-03-31 14:59:30 +00003907static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003908 {"compile", _compile, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00003909 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003910 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003911 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003912};
3913
Tim Peters3d563502006-01-21 02:47:53 +00003914#if PY_VERSION_HEX < 0x02030000
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003915DL_EXPORT(void) init_sre(void)
3916#else
Mark Hammond8235ea12002-07-19 06:55:41 +00003917PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00003918#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00003919{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003920 PyObject* m;
3921 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003922 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003923
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003924 /* Patch object types */
Benjamin Petersone266d3e2010-04-06 03:34:09 +00003925 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3926 PyType_Ready(&Scanner_Type))
3927 return;
Guido van Rossumb700df92000-03-31 14:59:30 +00003928
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00003929 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003930 if (m == NULL)
3931 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003932 d = PyModule_GetDict(m);
3933
Fredrik Lundh21009b92001-09-18 18:47:09 +00003934 x = PyInt_FromLong(SRE_MAGIC);
3935 if (x) {
3936 PyDict_SetItemString(d, "MAGIC", x);
3937 Py_DECREF(x);
3938 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003939
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003940 x = PyInt_FromLong(sizeof(SRE_CODE));
3941 if (x) {
3942 PyDict_SetItemString(d, "CODESIZE", x);
3943 Py_DECREF(x);
3944 }
3945
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02003946 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3947 if (x) {
3948 PyDict_SetItemString(d, "MAXREPEAT", x);
3949 Py_DECREF(x);
3950 }
3951
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003952 x = PyString_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003953 if (x) {
3954 PyDict_SetItemString(d, "copyright", x);
3955 Py_DECREF(x);
3956 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003957}
3958
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003959#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003960
3961/* vim:ts=4:sw=4:et
3962*/