blob: 6fd3affb09ab43f0a8bb1c2c1ecd762839d945df [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Neal Norwitza6d80fa2006-06-12 03:05:40 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d52000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000055
Neal Norwitz94a9c092006-03-16 06:30:02 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh971e78b2001-10-20 17:48:46 +000061#if PY_VERSION_HEX >= 0x01060000
62#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
Fredrik Lundh22d25462000-07-01 17:50:59 +000063/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d52000-06-29 08:58:44 +000064#define HAVE_UNICODE
65#endif
Fredrik Lundh971e78b2001-10-20 17:48:46 +000066#endif
Fredrik Lundh436c3d52000-06-29 08:58:44 +000067
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000068/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000069/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070
71/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000072#define USE_FAST_SEARCH
73
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000074/* enables aggressive inlining (always on for Visual C) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000075#undef USE_INLINE
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000076
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000077/* enables copy/deepcopy handling (work in progress) */
78#undef USE_BUILTIN_COPY
79
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000080#if PY_VERSION_HEX < 0x01060000
81#define PyObject_DEL(op) PyMem_DEL((op))
82#endif
83
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084/* -------------------------------------------------------------------- */
85
Fredrik Lundh80946112000-06-29 18:03:25 +000086#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000087#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000088#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000089/* fastest possible local call under MSVC */
90#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000091#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000092#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000093#else
94#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000095#endif
96
97/* error codes */
98#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000099#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000100#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +0000101#define SRE_ERROR_MEMORY -9 /* out of memory */
Facundo Batista4473d222008-01-08 21:10:12 +0000102#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +0000103
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000104#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +0000105#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +0000106#else
107#define TRACE(v)
108#endif
109
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000110/* -------------------------------------------------------------------- */
111/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000112
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000113/* default character predicates (run sre_chars.py to regenerate tables) */
114
115#define SRE_DIGIT_MASK 1
116#define SRE_SPACE_MASK 2
117#define SRE_LINEBREAK_MASK 4
118#define SRE_ALNUM_MASK 8
119#define SRE_WORD_MASK 16
120
Fredrik Lundh21009b92001-09-18 18:47:09 +0000121/* FIXME: this assumes ASCII. create tables in init_sre() instead */
122
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000123static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1242, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12625, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12724, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1280, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12924, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130
Fredrik Lundhb389df32000-06-29 12:48:37 +0000131static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
Fredrik Lundh436c3d52000-06-29 08:58:44 +000013210, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
13327, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13444, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13561, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139120, 121, 122, 123, 124, 125, 126, 127 };
140
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000141#define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143#define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145#define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147#define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149#define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000151
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000152static unsigned int sre_lower(unsigned int ch)
153{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000155}
156
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000157/* locale-specific character predicates */
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000158/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000162#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000163#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000164#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000166static unsigned int sre_lower_locale(unsigned int ch)
167{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000169}
170
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000171/* unicode-specific character predicates */
172
173#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000174
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000175#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000176#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
Fredrik Lundh22d25462000-07-01 17:50:59 +0000178#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000179#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000180
181static unsigned int sre_lower_unicode(unsigned int ch)
182{
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184}
185
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000186#endif
187
Guido van Rossumb700df92000-03-31 14:59:30 +0000188LOCAL(int)
189sre_category(SRE_CODE category, unsigned int ch)
190{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000191 switch (category) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000193 case SRE_CATEGORY_DIGIT:
194 return SRE_IS_DIGIT(ch);
195 case SRE_CATEGORY_NOT_DIGIT:
196 return !SRE_IS_DIGIT(ch);
197 case SRE_CATEGORY_SPACE:
198 return SRE_IS_SPACE(ch);
199 case SRE_CATEGORY_NOT_SPACE:
200 return !SRE_IS_SPACE(ch);
201 case SRE_CATEGORY_WORD:
202 return SRE_IS_WORD(ch);
203 case SRE_CATEGORY_NOT_WORD:
204 return !SRE_IS_WORD(ch);
205 case SRE_CATEGORY_LINEBREAK:
206 return SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_NOT_LINEBREAK:
208 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000209
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000210 case SRE_CATEGORY_LOC_WORD:
211 return SRE_LOC_IS_WORD(ch);
212 case SRE_CATEGORY_LOC_NOT_WORD:
213 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000214
215#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 case SRE_CATEGORY_UNI_DIGIT:
217 return SRE_UNI_IS_DIGIT(ch);
218 case SRE_CATEGORY_UNI_NOT_DIGIT:
219 return !SRE_UNI_IS_DIGIT(ch);
220 case SRE_CATEGORY_UNI_SPACE:
221 return SRE_UNI_IS_SPACE(ch);
222 case SRE_CATEGORY_UNI_NOT_SPACE:
223 return !SRE_UNI_IS_SPACE(ch);
224 case SRE_CATEGORY_UNI_WORD:
225 return SRE_UNI_IS_WORD(ch);
226 case SRE_CATEGORY_UNI_NOT_WORD:
227 return !SRE_UNI_IS_WORD(ch);
228 case SRE_CATEGORY_UNI_LINEBREAK:
229 return SRE_UNI_IS_LINEBREAK(ch);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000232#else
233 case SRE_CATEGORY_UNI_DIGIT:
234 return SRE_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_NOT_DIGIT:
236 return !SRE_IS_DIGIT(ch);
237 case SRE_CATEGORY_UNI_SPACE:
238 return SRE_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_NOT_SPACE:
240 return !SRE_IS_SPACE(ch);
241 case SRE_CATEGORY_UNI_WORD:
242 return SRE_LOC_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_NOT_WORD:
244 return !SRE_LOC_IS_WORD(ch);
245 case SRE_CATEGORY_UNI_LINEBREAK:
246 return SRE_IS_LINEBREAK(ch);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000249#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000250 }
251 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000252}
253
254/* helpers */
255
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000256static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000257data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000258{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000259 if (state->data_stack) {
Jack Diederich2d400772006-05-27 15:44:34 +0000260 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000261 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000262 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000263 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000264}
265
266static int
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000267data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000268{
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000269 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000270 minsize = state->data_stack_base+size;
271 cursize = state->data_stack_size;
272 if (cursize < minsize) {
273 void* stack;
274 cursize = minsize+minsize/4+1024;
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300275 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Jack Diederich2d400772006-05-27 15:44:34 +0000276 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000277 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000279 return SRE_ERROR_MEMORY;
280 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000281 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000282 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000283 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000284 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000285}
286
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000287/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000288
289#define SRE_CHAR unsigned char
290#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000291#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000292#define SRE_CHARSET sre_charset
293#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000294#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000295#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000296#define SRE_SEARCH sre_search
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000297#define SRE_LITERAL_TEMPLATE sre_literal_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000298
299#if defined(HAVE_UNICODE)
300
Guido van Rossumb700df92000-03-31 14:59:30 +0000301#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000302#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000303#undef SRE_RECURSIVE
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000304
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000305#undef SRE_LITERAL_TEMPLATE
Guido van Rossumb700df92000-03-31 14:59:30 +0000306#undef SRE_SEARCH
307#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000308#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000309#undef SRE_INFO
310#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000311#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000312#undef SRE_AT
313#undef SRE_CHAR
314
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000315/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000316
317#define SRE_CHAR Py_UNICODE
318#define SRE_AT sre_uat
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000319#define SRE_COUNT sre_ucount
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000320#define SRE_CHARSET sre_ucharset
321#define SRE_INFO sre_uinfo
Guido van Rossumb700df92000-03-31 14:59:30 +0000322#define SRE_MATCH sre_umatch
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000323#define SRE_MATCH_CONTEXT sre_umatch_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000324#define SRE_SEARCH sre_usearch
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000325#define SRE_LITERAL_TEMPLATE sre_uliteral_template
Fredrik Lundh436c3d52000-06-29 08:58:44 +0000326#endif
Guido van Rossumb700df92000-03-31 14:59:30 +0000327
328#endif /* SRE_RECURSIVE */
329
330/* -------------------------------------------------------------------- */
331/* String matching engine */
332
333/* the following section is compiled twice, with different character
334 settings */
335
336LOCAL(int)
337SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000339 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000340
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000341 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000343 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000344
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000345 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000346 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000347 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000348
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000349 case SRE_AT_BEGINNING_LINE:
350 return ((void*) ptr == state->beginning ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000351 SRE_IS_LINEBREAK((int) ptr[-1]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000352
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000353 case SRE_AT_END:
Serhiy Storchaka7865f212015-07-06 13:58:24 +0300354 return (((SRE_CHAR *)state->end - ptr == 1 &&
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000355 SRE_IS_LINEBREAK((int) ptr[0])) ||
356 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000357
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000358 case SRE_AT_END_LINE:
359 return ((void*) ptr == state->end ||
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000360 SRE_IS_LINEBREAK((int) ptr[0]));
Fredrik Lundh80946112000-06-29 18:03:25 +0000361
Fredrik Lundh770617b2001-01-14 15:06:11 +0000362 case SRE_AT_END_STRING:
363 return ((void*) ptr == state->end);
364
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000368 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000369 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000370 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000371 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000372 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000373
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000377 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000378 SRE_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000379 thisp = ((void*) ptr < state->end) ?
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000380 SRE_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000381 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000382
383 case SRE_AT_LOC_BOUNDARY:
384 if (state->beginning == state->end)
385 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000386 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000388 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000389 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000390 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000391
392 case SRE_AT_LOC_NON_BOUNDARY:
393 if (state->beginning == state->end)
394 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000395 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000397 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000398 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000399 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000400
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000401#if defined(HAVE_UNICODE)
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000402 case SRE_AT_UNI_BOUNDARY:
403 if (state->beginning == state->end)
404 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000405 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000407 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000408 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000409 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000410
411 case SRE_AT_UNI_NON_BOUNDARY:
412 if (state->beginning == state->end)
413 return 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000414 thatp = ((void*) ptr > state->beginning) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000416 thisp = ((void*) ptr < state->end) ?
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000417 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000418 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000419#endif
420
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000421 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000422
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000423 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000424}
425
426LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000427SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000428{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000430
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000431 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000432
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000433 for (;;) {
434 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_FAILURE:
437 return !ok;
438
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000439 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000440 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000441 if (ch == set[0])
442 return ok;
443 set++;
444 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000446 case SRE_OP_CATEGORY:
447 /* <CATEGORY> <code> */
448 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000449 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000450 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000451 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000452
Fredrik Lundh3562f112000-07-02 12:00:07 +0000453 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000454 if (sizeof(SRE_CODE) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 return ok;
458 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000459 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000460 else {
461 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith64ab35e2012-12-10 17:45:54 -0800462 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000463 return ok;
464 set += 8;
465 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000466 break;
467
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000468 case SRE_OP_RANGE:
469 /* <RANGE> <lower> <upper> */
470 if (set[0] <= ch && ch <= set[1])
471 return ok;
472 set += 2;
473 break;
474
475 case SRE_OP_NEGATE:
476 ok = !ok;
477 break;
478
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000479 case SRE_OP_BIGCHARSET:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000482 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000483 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000484
485 if (sizeof(SRE_CODE) == 2) {
486 block = ((unsigned char*)set)[ch >> 8];
487 set += 128;
488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 return ok;
490 set += count*16;
491 }
492 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
495 if (!(ch & ~65535))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000496 block = ((unsigned char*)set)[ch >> 8];
497 else
498 block = -1;
499 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000500 if (block >=0 &&
Gregory P. Smith64ab35e2012-12-10 17:45:54 -0800501 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000502 return ok;
503 set += count*8;
504 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000505 break;
506 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000507
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000508 default:
509 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000510 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000511 return 0;
512 }
513 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000514}
515
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000516LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000518LOCAL(Py_ssize_t)
519SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000520{
521 SRE_CODE chr;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000524 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000525
526 /* adjust end */
Serhiy Storchakae18e05c2013-02-16 16:47:15 +0200527 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000528 end = ptr + maxcount;
529
530 switch (pattern[0]) {
531
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000532 case SRE_OP_IN:
533 /* repeated set */
534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 ptr++;
537 break;
538
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539 case SRE_OP_ANY:
540 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 ptr++;
544 break;
545
546 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000547 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000548 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000550 ptr = end;
551 break;
552
553 case SRE_OP_LITERAL:
554 /* repeated literal */
555 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000557 while (ptr < end && (SRE_CODE) *ptr == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_LITERAL_IGNORE:
562 /* repeated literal */
563 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL:
570 /* repeated non-literal */
571 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000573 while (ptr < end && (SRE_CODE) *ptr != chr)
574 ptr++;
575 break;
Tim Peters3d563502006-01-21 02:47:53 +0000576
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000577 case SRE_OP_NOT_LITERAL_IGNORE:
578 /* repeated non-literal */
579 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 ptr++;
583 break;
584
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000585 default:
586 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000588 while ((SRE_CHAR*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000589 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000590 if (i < 0)
591 return i;
592 if (!i)
593 break;
594 }
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300595 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000596 (SRE_CHAR*) state->ptr - ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000597 return (SRE_CHAR*) state->ptr - ptr;
598 }
599
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300600 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
601 ptr - (SRE_CHAR*) state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000602 return ptr - (SRE_CHAR*) state->ptr;
603}
604
Fredrik Lundh33accc12000-08-27 20:59:47 +0000605#if 0 /* not used in this release */
Guido van Rossumb700df92000-03-31 14:59:30 +0000606LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000607SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
608{
609 /* check if an SRE_OP_INFO block matches at the current position.
610 returns the number of SRE_CODE objects to skip if successful, 0
611 if no match */
612
613 SRE_CHAR* end = state->end;
614 SRE_CHAR* ptr = state->ptr;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000615 Py_ssize_t i;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000616
617 /* check minimal length */
618 if (pattern[3] && (end - ptr) < pattern[3])
619 return 0;
620
621 /* check known prefix */
622 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
623 /* <length> <skip> <prefix data> <overlap data> */
624 for (i = 0; i < pattern[5]; i++)
625 if ((SRE_CODE) ptr[i] != pattern[7 + i])
626 return 0;
627 return pattern[0] + 2 * pattern[6];
628 }
629 return pattern[0];
630}
Fredrik Lundh33accc12000-08-27 20:59:47 +0000631#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000632
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000633/* The macros below should be used to protect recursive SRE_MATCH()
634 * calls that *failed* and do *not* return immediately (IOW, those
635 * that will backtrack). Explaining:
636 *
637 * - Recursive SRE_MATCH() returned true: that's usually a success
638 * (besides atypical cases like ASSERT_NOT), therefore there's no
639 * reason to restore lastmark;
640 *
641 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
642 * is returning to the caller: If the current SRE_MATCH() is the
643 * top function of the recursion, returning false will be a matching
644 * failure, and it doesn't matter where lastmark is pointing to.
645 * If it's *not* the top function, it will be a recursive SRE_MATCH()
646 * failure by itself, and the calling SRE_MATCH() will have to deal
647 * with the failure by the same rules explained here (it will restore
648 * lastmark by itself if necessary);
649 *
650 * - Recursive SRE_MATCH() returned false, and will continue the
651 * outside 'for' loop: must be protected when breaking, since the next
652 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000653 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000654 * - Recursive SRE_MATCH() returned false, and will be called again
655 * inside a local for/while loop: must be protected between each
656 * loop iteration, since the recursive SRE_MATCH() could do anything,
657 * and could potentially depend on lastmark.
658 *
659 * For more information, check the discussion at SF patch #712900.
660 */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000661#define LASTMARK_SAVE() \
662 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000663 ctx->lastmark = state->lastmark; \
664 ctx->lastindex = state->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000665 } while (0)
666#define LASTMARK_RESTORE() \
667 do { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000668 state->lastmark = ctx->lastmark; \
669 state->lastindex = ctx->lastindex; \
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000670 } while (0)
671
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000672#define RETURN_ERROR(i) do { return i; } while(0)
673#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
674#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
675
676#define RETURN_ON_ERROR(i) \
677 do { if (i < 0) RETURN_ERROR(i); } while (0)
678#define RETURN_ON_SUCCESS(i) \
679 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
680#define RETURN_ON_FAILURE(i) \
681 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
682
683#define SFY(x) #x
684
685#define DATA_STACK_ALLOC(state, type, ptr) \
686do { \
687 alloc_pos = state->data_stack_base; \
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300688 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
689 "(%" PY_FORMAT_SIZE_T "d)\n", \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000690 SFY(type), alloc_pos, sizeof(type))); \
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +0300691 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000692 int j = data_stack_grow(state, sizeof(type)); \
693 if (j < 0) return j; \
694 if (ctx_pos != -1) \
695 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
696 } \
697 ptr = (type*)(state->data_stack+alloc_pos); \
698 state->data_stack_base += sizeof(type); \
699} while (0)
700
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000701#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
702do { \
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300703 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000704 ptr = (type*)(state->data_stack+pos); \
705} while (0)
706
707#define DATA_STACK_PUSH(state, data, size) \
708do { \
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300709 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
710 "(%" PY_FORMAT_SIZE_T "d)\n", \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000711 data, state->data_stack_base, size)); \
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +0300712 if (size > state->data_stack_size - state->data_stack_base) { \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000713 int j = data_stack_grow(state, size); \
714 if (j < 0) return j; \
715 if (ctx_pos != -1) \
716 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
717 } \
718 memcpy(state->data_stack+state->data_stack_base, data, size); \
719 state->data_stack_base += size; \
720} while (0)
721
722#define DATA_STACK_POP(state, data, size, discard) \
723do { \
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300724 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
725 "(%" PY_FORMAT_SIZE_T "d)\n", \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728 if (discard) \
729 state->data_stack_base -= size; \
730} while (0)
731
732#define DATA_STACK_POP_DISCARD(state, size) \
733do { \
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300734 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
735 "(%" PY_FORMAT_SIZE_T "d)\n", \
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000736 state->data_stack_base-size, size)); \
737 state->data_stack_base -= size; \
738} while(0)
739
740#define DATA_PUSH(x) \
741 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
742#define DATA_POP(x) \
743 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000744#define DATA_POP_DISCARD(x) \
745 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
746#define DATA_ALLOC(t,p) \
747 DATA_STACK_ALLOC(state, t, p)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000748#define DATA_LOOKUP_AT(t,p,pos) \
749 DATA_STACK_LOOKUP_AT(state,t,p,pos)
750
751#define MARK_PUSH(lastmark) \
752 do if (lastmark > 0) { \
753 i = lastmark; /* ctx->lastmark may change if reallocated */ \
754 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755 } while (0)
756#define MARK_POP(lastmark) \
757 do if (lastmark > 0) { \
758 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759 } while (0)
760#define MARK_POP_KEEP(lastmark) \
761 do if (lastmark > 0) { \
762 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763 } while (0)
764#define MARK_POP_DISCARD(lastmark) \
765 do if (lastmark > 0) { \
766 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
767 } while (0)
768
769#define JUMP_NONE 0
770#define JUMP_MAX_UNTIL_1 1
771#define JUMP_MAX_UNTIL_2 2
772#define JUMP_MAX_UNTIL_3 3
773#define JUMP_MIN_UNTIL_1 4
774#define JUMP_MIN_UNTIL_2 5
775#define JUMP_MIN_UNTIL_3 6
776#define JUMP_REPEAT 7
777#define JUMP_REPEAT_ONE_1 8
778#define JUMP_REPEAT_ONE_2 9
779#define JUMP_MIN_REPEAT_ONE 10
780#define JUMP_BRANCH 11
781#define JUMP_ASSERT 12
782#define JUMP_ASSERT_NOT 13
783
784#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
785 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
786 nextctx->last_ctx_pos = ctx_pos; \
787 nextctx->jump = jumpvalue; \
788 nextctx->pattern = nextpattern; \
789 ctx_pos = alloc_pos; \
790 ctx = nextctx; \
791 goto entrance; \
792 jumplabel: \
793 while (0) /* gcc doesn't like labels at end of scopes */ \
794
795typedef struct {
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000796 Py_ssize_t last_ctx_pos;
797 Py_ssize_t jump;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798 SRE_CHAR* ptr;
799 SRE_CODE* pattern;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000800 Py_ssize_t count;
801 Py_ssize_t lastmark;
802 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000803 union {
804 SRE_CODE chr;
805 SRE_REPEAT* rep;
806 } u;
807} SRE_MATCH_CONTEXT;
808
809/* check if string matches the given pattern. returns <0 for
810 error, 0 for failure, and 1 for success */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000811LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000812SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000813{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000814 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000815 Py_ssize_t alloc_pos, ctx_pos = -1;
816 Py_ssize_t i, ret = 0;
817 Py_ssize_t jump;
Facundo Batista4473d222008-01-08 21:10:12 +0000818 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000819
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000820 SRE_MATCH_CONTEXT* ctx;
821 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000822
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000823 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000824
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000825 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
826 ctx->last_ctx_pos = -1;
827 ctx->jump = JUMP_NONE;
828 ctx->pattern = pattern;
829 ctx_pos = alloc_pos;
830
831entrance:
832
Anthony Baxteraefd8ca2006-04-12 04:26:11 +0000833 ctx->ptr = (SRE_CHAR *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000834
835 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000836 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000837 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000838 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
Serhiy Storchakacf29ba82013-09-05 18:02:57 +0300839 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
840 "need %" PY_FORMAT_SIZE_T "d)\n",
841 (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000842 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000843 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000844 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000845 }
846
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000847 for (;;) {
Facundo Batista4473d222008-01-08 21:10:12 +0000848 ++sigcount;
849 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
850 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000851
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000852 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000853
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000854 case SRE_OP_MARK:
855 /* set mark */
856 /* <MARK> <gid> */
857 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
858 ctx->ptr, ctx->pattern[0]));
859 i = ctx->pattern[0];
860 if (i & 1)
861 state->lastindex = i/2 + 1;
862 if (i > state->lastmark) {
863 /* state->lastmark is the highest valid index in the
864 state->mark array. If it is increased by more than 1,
865 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000866 that these marks have not been encountered. */
Neal Norwitza6d80fa2006-06-12 03:05:40 +0000867 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000868 while (j < i)
869 state->mark[j++] = NULL;
870 state->lastmark = i;
871 }
872 state->mark[i] = ctx->ptr;
873 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000874 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000875
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000876 case SRE_OP_LITERAL:
877 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000878 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000879 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000885 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000886
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000887 case SRE_OP_NOT_LITERAL:
888 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000889 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000890 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
891 ctx->ptr, *ctx->pattern));
892 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
893 RETURN_FAILURE;
894 ctx->pattern++;
895 ctx->ptr++;
896 break;
897
898 case SRE_OP_SUCCESS:
899 /* end of pattern */
900 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
901 state->ptr = ctx->ptr;
902 RETURN_SUCCESS;
903
904 case SRE_OP_AT:
905 /* match at given position */
906 /* <AT> <code> */
907 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
908 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 break;
912
913 case SRE_OP_CATEGORY:
914 /* match at given category */
915 /* <CATEGORY> <code> */
916 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
917 ctx->ptr, *ctx->pattern));
918 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->pattern++;
921 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000922 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000923
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000924 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000925 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000926 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000927 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
928 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
929 RETURN_FAILURE;
930 ctx->ptr++;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000931 break;
932
933 case SRE_OP_ANY_ALL:
934 /* match anything */
935 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000936 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
937 if (ctx->ptr >= end)
938 RETURN_FAILURE;
939 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000940 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000941
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000942 case SRE_OP_IN:
943 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000944 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000945 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
946 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
947 RETURN_FAILURE;
948 ctx->pattern += ctx->pattern[0];
949 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000950 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000951
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000953 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, ctx->pattern[0]));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000960 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000961
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000962 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000963 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
964 ctx->pattern, ctx->ptr, *ctx->pattern));
965 if (ctx->ptr >= end ||
966 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
967 RETURN_FAILURE;
968 ctx->pattern++;
969 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000970 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000971
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000972 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
974 if (ctx->ptr >= end
975 || !SRE_CHARSET(ctx->pattern+1,
976 (SRE_CODE)state->lower(*ctx->ptr)))
977 RETURN_FAILURE;
978 ctx->pattern += ctx->pattern[0];
979 ctx->ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000980 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000981
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000982 case SRE_OP_JUMP:
983 case SRE_OP_INFO:
984 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000985 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000986 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
987 ctx->ptr, ctx->pattern[0]));
988 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000989 break;
990
991 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000992 /* alternation */
993 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000994 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000995 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000996 ctx->u.rep = state->repeat;
997 if (ctx->u.rep)
998 MARK_PUSH(ctx->lastmark);
999 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1000 if (ctx->pattern[1] == SRE_OP_LITERAL &&
1001 (ctx->ptr >= end ||
1002 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001003 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001004 if (ctx->pattern[1] == SRE_OP_IN &&
1005 (ctx->ptr >= end ||
1006 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001007 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001008 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001009 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001010 if (ret) {
1011 if (ctx->u.rep)
1012 MARK_POP_DISCARD(ctx->lastmark);
1013 RETURN_ON_ERROR(ret);
1014 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001015 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001016 if (ctx->u.rep)
1017 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001018 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001019 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001020 if (ctx->u.rep)
1021 MARK_POP_DISCARD(ctx->lastmark);
1022 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001023
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001024 case SRE_OP_REPEAT_ONE:
1025 /* match repeated sequence (maximizing regexp) */
1026
1027 /* this operator only works if the repeated item is
1028 exactly one character wide, and we're not already
1029 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +00001030 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031
1032 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1033
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001034 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1035 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001036
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001037 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001038 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001039
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001041
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001042 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1043 RETURN_ON_ERROR(ret);
1044 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1045 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001046 ctx->ptr += ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001047
1048 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001049 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001050 string. check if the rest of the pattern matches,
1051 and backtrack if not. */
1052
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001053 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001054 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001055
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001056 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001057 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 state->ptr = ctx->ptr;
1059 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001060 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001061
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001062 LASTMARK_SAVE();
1063
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001065 /* tail starts with a literal. skip positions where
1066 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001067 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001068 for (;;) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001069 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001070 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1071 ctx->ptr--;
1072 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001073 }
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001074 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001075 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001076 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001077 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1078 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 if (ret) {
1080 RETURN_ON_ERROR(ret);
1081 RETURN_SUCCESS;
1082 }
Tim Peters3d563502006-01-21 02:47:53 +00001083
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001084 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001085
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001086 ctx->ptr--;
1087 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001088 }
1089
1090 } else {
1091 /* general case */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001092 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001093 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001094 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1095 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001096 if (ret) {
1097 RETURN_ON_ERROR(ret);
1098 RETURN_SUCCESS;
1099 }
1100 ctx->ptr--;
1101 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001102 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001103 }
1104 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001105 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001106
Guido van Rossum41c99e72003-04-14 17:59:34 +00001107 case SRE_OP_MIN_REPEAT_ONE:
1108 /* match repeated sequence (minimizing regexp) */
1109
1110 /* this operator only works if the repeated item is
1111 exactly one character wide, and we're not already
1112 collecting backtracking points. for other cases,
1113 use the MIN_REPEAT operator */
1114
1115 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1116
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1118 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001119
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001120 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001121 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001122
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001123 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001124
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 if (ctx->pattern[1] == 0)
1126 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001127 else {
1128 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001129 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1130 RETURN_ON_ERROR(ret);
1131 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001132 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001133 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001134 RETURN_FAILURE;
1135 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001136 ctx->count = ret;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001137 ctx->ptr += ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001138 }
1139
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001140 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001141 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001142 state->ptr = ctx->ptr;
1143 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001144
1145 } else {
1146 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001147 LASTMARK_SAVE();
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001148 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001149 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001150 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001151 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1152 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001153 if (ret) {
1154 RETURN_ON_ERROR(ret);
1155 RETURN_SUCCESS;
1156 }
1157 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001158 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001160 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001161 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001162 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001163 assert(ret == 1);
1164 ctx->ptr++;
1165 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001166 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001167 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001168 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001169 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001170
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001171 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001172 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001173 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001174 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001175 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1176 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001177
1178 /* install new repeat context */
Jack Diederich2d400772006-05-27 15:44:34 +00001179 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001180 if (!ctx->u.rep) {
1181 PyErr_NoMemory();
Neal Norwitzef0de022006-08-12 01:53:28 +00001182 RETURN_FAILURE;
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001183 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001184 ctx->u.rep->count = -1;
1185 ctx->u.rep->pattern = ctx->pattern;
1186 ctx->u.rep->prev = state->repeat;
1187 ctx->u.rep->last_ptr = NULL;
1188 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001189
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001190 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001191 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001192 state->repeat = ctx->u.rep->prev;
Jack Diederich2d400772006-05-27 15:44:34 +00001193 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001194
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 if (ret) {
1196 RETURN_ON_ERROR(ret);
1197 RETURN_SUCCESS;
1198 }
1199 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200
1201 case SRE_OP_MAX_UNTIL:
1202 /* maximizing repeat */
1203 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1204
1205 /* FIXME: we probably need to deal with zero-width
1206 matches in here... */
1207
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 ctx->u.rep = state->repeat;
1209 if (!ctx->u.rep)
1210 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001211
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001212 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001213
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001214 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001215
Serhiy Storchakacf29ba82013-09-05 18:02:57 +03001216 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001217 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001218
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001219 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001220 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001221 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001222 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1223 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001224 if (ret) {
1225 RETURN_ON_ERROR(ret);
1226 RETURN_SUCCESS;
1227 }
1228 ctx->u.rep->count = ctx->count-1;
1229 state->ptr = ctx->ptr;
1230 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001231 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001232
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001233 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001234 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001236 /* we may have enough matches, but if we can
1237 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001238 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001239 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001240 MARK_PUSH(ctx->lastmark);
1241 /* zero-width match protection */
1242 DATA_PUSH(&ctx->u.rep->last_ptr);
1243 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001244 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1245 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 DATA_POP(&ctx->u.rep->last_ptr);
1247 if (ret) {
1248 MARK_POP_DISCARD(ctx->lastmark);
1249 RETURN_ON_ERROR(ret);
1250 RETURN_SUCCESS;
1251 }
1252 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001253 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001254 ctx->u.rep->count = ctx->count-1;
1255 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001256 }
1257
1258 /* cannot match more repeated items here. make sure the
1259 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001260 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001261 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001262 RETURN_ON_SUCCESS(ret);
1263 state->repeat = ctx->u.rep;
1264 state->ptr = ctx->ptr;
1265 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001266
1267 case SRE_OP_MIN_UNTIL:
1268 /* minimizing repeat */
1269 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1270
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001271 ctx->u.rep = state->repeat;
1272 if (!ctx->u.rep)
1273 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001274
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001275 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001276
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001277 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001278
Serhiy Storchakacf29ba82013-09-05 18:02:57 +03001279 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001280 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001281
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001282 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001283 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001284 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001285 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1286 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 if (ret) {
1288 RETURN_ON_ERROR(ret);
1289 RETURN_SUCCESS;
1290 }
1291 ctx->u.rep->count = ctx->count-1;
1292 state->ptr = ctx->ptr;
1293 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001294 }
1295
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001296 LASTMARK_SAVE();
1297
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001298 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001299 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001300 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001301 if (ret) {
1302 RETURN_ON_ERROR(ret);
1303 RETURN_SUCCESS;
1304 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001305
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001306 state->repeat = ctx->u.rep;
1307 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001308
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001309 LASTMARK_RESTORE();
1310
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001311 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +02001312 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1313 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001314 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001315
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001316 ctx->u.rep->count = ctx->count;
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +02001317 /* zero-width match protection */
1318 DATA_PUSH(&ctx->u.rep->last_ptr);
1319 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001320 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1321 ctx->u.rep->pattern+3);
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +02001322 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001323 if (ret) {
1324 RETURN_ON_ERROR(ret);
1325 RETURN_SUCCESS;
1326 }
1327 ctx->u.rep->count = ctx->count-1;
1328 state->ptr = ctx->ptr;
1329 RETURN_FAILURE;
1330
1331 case SRE_OP_GROUPREF:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1334 ctx->ptr, ctx->pattern[0]));
1335 i = ctx->pattern[0];
1336 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001337 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001338 if (groupref >= state->lastmark) {
1339 RETURN_FAILURE;
1340 } else {
1341 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343 if (!p || !e || e < p)
1344 RETURN_FAILURE;
1345 while (p < e) {
1346 if (ctx->ptr >= end || *ctx->ptr != *p)
1347 RETURN_FAILURE;
1348 p++; ctx->ptr++;
1349 }
1350 }
1351 }
1352 ctx->pattern++;
1353 break;
1354
1355 case SRE_OP_GROUPREF_IGNORE:
1356 /* match backreference */
1357 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1358 ctx->ptr, ctx->pattern[0]));
1359 i = ctx->pattern[0];
1360 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001361 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001362 if (groupref >= state->lastmark) {
1363 RETURN_FAILURE;
1364 } else {
1365 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1366 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1367 if (!p || !e || e < p)
1368 RETURN_FAILURE;
1369 while (p < e) {
1370 if (ctx->ptr >= end ||
1371 state->lower(*ctx->ptr) != state->lower(*p))
1372 RETURN_FAILURE;
1373 p++; ctx->ptr++;
1374 }
1375 }
1376 }
1377 ctx->pattern++;
1378 break;
1379
1380 case SRE_OP_GROUPREF_EXISTS:
1381 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1382 ctx->ptr, ctx->pattern[0]));
1383 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1384 i = ctx->pattern[0];
1385 {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001386 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001387 if (groupref >= state->lastmark) {
1388 ctx->pattern += ctx->pattern[1];
1389 break;
1390 } else {
1391 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1392 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1393 if (!p || !e || e < p) {
1394 ctx->pattern += ctx->pattern[1];
1395 break;
1396 }
1397 }
1398 }
1399 ctx->pattern += 2;
1400 break;
1401
1402 case SRE_OP_ASSERT:
1403 /* assert subpattern */
1404 /* <ASSERT> <skip> <back> <pattern> */
1405 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1406 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001407 if (ctx->ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001408 RETURN_FAILURE;
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001409 state->ptr = ctx->ptr - ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001410 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001411 RETURN_ON_FAILURE(ret);
1412 ctx->pattern += ctx->pattern[0];
1413 break;
1414
1415 case SRE_OP_ASSERT_NOT:
1416 /* assert not subpattern */
1417 /* <ASSERT_NOT> <skip> <back> <pattern> */
1418 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1419 ctx->ptr, ctx->pattern[1]));
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001420 if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
1421 state->ptr = ctx->ptr - ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001422 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001423 if (ret) {
1424 RETURN_ON_ERROR(ret);
1425 RETURN_FAILURE;
1426 }
1427 }
1428 ctx->pattern += ctx->pattern[0];
1429 break;
1430
1431 case SRE_OP_FAILURE:
1432 /* immediate failure */
1433 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1434 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001435
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001436 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001437 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1438 ctx->pattern[-1]));
1439 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001440 }
1441 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001442
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001443exit:
1444 ctx_pos = ctx->last_ctx_pos;
1445 jump = ctx->jump;
1446 DATA_POP_DISCARD(ctx);
1447 if (ctx_pos == -1)
1448 return ret;
1449 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1450
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001451 switch (jump) {
1452 case JUMP_MAX_UNTIL_2:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_2;
1455 case JUMP_MAX_UNTIL_3:
1456 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1457 goto jump_max_until_3;
1458 case JUMP_MIN_UNTIL_2:
1459 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1460 goto jump_min_until_2;
1461 case JUMP_MIN_UNTIL_3:
1462 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1463 goto jump_min_until_3;
1464 case JUMP_BRANCH:
1465 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1466 goto jump_branch;
1467 case JUMP_MAX_UNTIL_1:
1468 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1469 goto jump_max_until_1;
1470 case JUMP_MIN_UNTIL_1:
1471 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1472 goto jump_min_until_1;
1473 case JUMP_REPEAT:
1474 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1475 goto jump_repeat;
1476 case JUMP_REPEAT_ONE_1:
1477 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1478 goto jump_repeat_one_1;
1479 case JUMP_REPEAT_ONE_2:
1480 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1481 goto jump_repeat_one_2;
1482 case JUMP_MIN_REPEAT_ONE:
1483 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1484 goto jump_min_repeat_one;
1485 case JUMP_ASSERT:
1486 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1487 goto jump_assert;
1488 case JUMP_ASSERT_NOT:
1489 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1490 goto jump_assert_not;
1491 case JUMP_NONE:
Serhiy Storchakacf29ba82013-09-05 18:02:57 +03001492 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1493 ctx->ptr, ret));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001494 break;
1495 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001496
1497 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001498}
1499
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001500LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001501SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1502{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001503 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1504 SRE_CHAR* end = (SRE_CHAR *)state->end;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001505 Py_ssize_t status = 0;
1506 Py_ssize_t prefix_len = 0;
1507 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001508 SRE_CODE* prefix = NULL;
1509 SRE_CODE* charset = NULL;
1510 SRE_CODE* overlap = NULL;
1511 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001512
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001513 if (ptr > end)
1514 return 0;
1515
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001516 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001517 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001518 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001519
1520 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001521
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001522 if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
1523 TRACE(("reject (got %u chars, need %u)\n",
1524 (unsigned int)(end - ptr), pattern[3]));
1525 return 0;
1526 }
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001527 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001528 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001529 character in there, so literal search will work) */
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001530 end -= pattern[3]-1;
1531 if (end <= ptr)
1532 end = ptr+1;
1533 }
1534
Fredrik Lundh3562f112000-07-02 12:00:07 +00001535 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001536 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001537 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001538 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001539 prefix_skip = pattern[6];
1540 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001541 overlap = prefix + prefix_len - 1;
1542 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001543 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001544 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001545 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001546
1547 pattern += 1 + pattern[1];
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001548 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001549
Serhiy Storchakacf29ba82013-09-05 18:02:57 +03001550 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1551 prefix, prefix_len, prefix_skip));
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001552 TRACE(("charset = %p\n", charset));
1553
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001554#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001555 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001556 /* pattern starts with a known prefix. use the overlap
1557 table to skip forward as fast as we possibly can */
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001558 Py_ssize_t i = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001559 end = (SRE_CHAR *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001560 while (ptr < end) {
1561 for (;;) {
Fredrik Lundh0640e112000-06-30 13:55:15 +00001562 if ((SRE_CODE) ptr[0] != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001563 if (!i)
1564 break;
1565 else
1566 i = overlap[i];
1567 } else {
1568 if (++i == prefix_len) {
1569 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001570 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1571 state->start = ptr + 1 - prefix_len;
1572 state->ptr = ptr + 1 - prefix_len + prefix_skip;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001573 if (flags & SRE_INFO_LITERAL)
1574 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001575 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001576 if (status != 0)
1577 return status;
1578 /* close but no cigar -- try again */
1579 i = overlap[i];
1580 }
1581 break;
1582 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001583 }
1584 ptr++;
1585 }
1586 return 0;
1587 }
1588#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001589
Fredrik Lundh3562f112000-07-02 12:00:07 +00001590 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001592 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001593 SRE_CODE chr = pattern[1];
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001594 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001595 for (;;) {
1596 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1597 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001598 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001599 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001600 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001601 state->start = ptr;
1602 state->ptr = ++ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001603 if (flags & SRE_INFO_LITERAL)
1604 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001605 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001606 if (status != 0)
1607 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001608 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001609 } else if (charset) {
1610 /* pattern starts with a character from a known set */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001611 end = (SRE_CHAR *)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001612 for (;;) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001613 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001614 ptr++;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001615 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001616 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001617 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001618 state->start = ptr;
1619 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001620 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001621 if (status != 0)
1622 break;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001623 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001624 }
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001625 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001626 /* general case */
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001627 assert(ptr <= end);
1628 while (1) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001629 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001630 state->start = state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001631 status = SRE_MATCH(state, pattern);
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001632 if (status != 0 || ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001633 break;
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001634 ptr++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001635 }
Serhiy Storchaka7865f212015-07-06 13:58:24 +03001636 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001637
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001638 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001639}
Tim Peters3d563502006-01-21 02:47:53 +00001640
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001641LOCAL(int)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001642SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001643{
1644 /* check if given string is a literal template (i.e. no escapes) */
1645 while (len-- > 0)
1646 if (*ptr++ == '\\')
1647 return 0;
1648 return 1;
1649}
Guido van Rossumb700df92000-03-31 14:59:30 +00001650
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001651#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001652
1653/* -------------------------------------------------------------------- */
1654/* factories and destructors */
1655
1656/* see sre.h for object declarations */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00001657static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1658static PyObject*pattern_scanner(PatternObject*, PyObject*);
Guido van Rossumb700df92000-03-31 14:59:30 +00001659
1660static PyObject *
Georg Brandl964f5972006-05-28 22:38:57 +00001661sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001662{
Benjamin Peterson9dccb012013-01-10 10:37:47 -06001663 return PyInt_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001664}
1665
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001666static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001667sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001668{
1669 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001670 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001671 return NULL;
1672 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001673 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001674 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001675#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001676 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001677#else
1678 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001679#endif
Fredrik Lundhb389df32000-06-29 12:48:37 +00001680 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001681}
1682
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001683LOCAL(void)
1684state_reset(SRE_STATE* state)
1685{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001686 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001687 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001688
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001689 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001690 state->lastindex = -1;
1691
1692 state->repeat = NULL;
1693
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001694 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001695}
1696
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001697static void*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001698getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
Guido van Rossumb700df92000-03-31 14:59:30 +00001699{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001700 /* given a python object, return a data pointer, a length (in
1701 characters), and a character size. return NULL if the object
1702 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001703
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001704 PyBufferProcs *buffer;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001705 Py_ssize_t size, bytes;
1706 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001707 void* ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001708
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001709#if defined(HAVE_UNICODE)
1710 if (PyUnicode_Check(string)) {
1711 /* unicode strings doesn't always support the buffer interface */
1712 ptr = (void*) PyUnicode_AS_DATA(string);
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00001713 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001714 size = PyUnicode_GET_SIZE(string);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001715 charsize = sizeof(Py_UNICODE);
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001716
1717 } else {
1718#endif
1719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001720 /* get pointer to string buffer */
Christian Heimese93237d2007-12-19 02:37:44 +00001721 buffer = Py_TYPE(string)->tp_as_buffer;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001722 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1723 buffer->bf_getsegcount(string, NULL) != 1) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001724 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001725 return NULL;
1726 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001727
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001728 /* determine buffer size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001729 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1730 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001731 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1732 return NULL;
1733 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001734
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001735 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001736#if PY_VERSION_HEX >= 0x01060000
1737 size = PyObject_Size(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001738#else
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001739 size = PyObject_Length(string);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001740#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00001741
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001742 if (PyString_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001743 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001744#if defined(HAVE_UNICODE)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001745 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001746 charsize = sizeof(Py_UNICODE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001747#endif
1748 else {
1749 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1750 return NULL;
1751 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001752
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001753#if defined(HAVE_UNICODE)
1754 }
1755#endif
1756
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001757 *p_length = size;
1758 *p_charsize = charsize;
1759
1760 return ptr;
1761}
1762
1763LOCAL(PyObject*)
1764state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001765 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766{
1767 /* prepare state object */
1768
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001769 Py_ssize_t length;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001770 int charsize;
1771 void* ptr;
1772
1773 memset(state, 0, sizeof(SRE_STATE));
1774
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001775 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001776 state->lastindex = -1;
1777
1778 ptr = getstring(string, &length, &charsize);
1779 if (!ptr)
1780 return NULL;
1781
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001782 /* adjust boundaries */
1783 if (start < 0)
1784 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001785 else if (start > length)
1786 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001787
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001788 if (end < 0)
1789 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001790 else if (end > length)
1791 end = length;
1792
1793 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001794
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001795 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001796
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001797 state->start = (void*) ((char*) ptr + start * state->charsize);
1798 state->end = (void*) ((char*) ptr + end * state->charsize);
1799
1800 Py_INCREF(string);
1801 state->string = string;
1802 state->pos = start;
1803 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001804
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001805 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001806 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001807 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001808#if defined(HAVE_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001809 state->lower = sre_lower_unicode;
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00001810#else
1811 state->lower = sre_lower_locale;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001812#endif
1813 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001814 state->lower = sre_lower;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001815
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001816 return string;
Guido van Rossumb700df92000-03-31 14:59:30 +00001817}
1818
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001819LOCAL(void)
1820state_fini(SRE_STATE* state)
1821{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001822 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001823 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001824}
1825
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001826/* calculate offset from start of string */
1827#define STATE_OFFSET(state, member)\
1828 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1829
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001830LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001831state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001832{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001833 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001834
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001835 index = (index - 1) * 2;
1836
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001837 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001838 if (empty)
1839 /* want empty string */
1840 i = j = 0;
1841 else {
1842 Py_INCREF(Py_None);
1843 return Py_None;
1844 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001845 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001846 i = STATE_OFFSET(state, state->mark[index]);
1847 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001848 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001849
Fredrik Lundh58100642000-08-09 09:14:35 +00001850 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001851}
1852
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001853static void
1854pattern_error(int status)
1855{
1856 switch (status) {
1857 case SRE_ERROR_RECURSION_LIMIT:
1858 PyErr_SetString(
1859 PyExc_RuntimeError,
1860 "maximum recursion limit exceeded"
1861 );
1862 break;
1863 case SRE_ERROR_MEMORY:
1864 PyErr_NoMemory();
1865 break;
Facundo Batista4473d222008-01-08 21:10:12 +00001866 case SRE_ERROR_INTERRUPTED:
1867 /* An exception has already been raised, so let it fly */
1868 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001869 default:
1870 /* other error codes indicate compiler/engine bugs */
1871 PyErr_SetString(
1872 PyExc_RuntimeError,
1873 "internal error in regular expression engine"
1874 );
1875 }
1876}
1877
Guido van Rossumb700df92000-03-31 14:59:30 +00001878static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001879pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001880{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001881 if (self->weakreflist != NULL)
1882 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 Py_XDECREF(self->pattern);
1884 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001885 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001886 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001887}
1888
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001889static int
1890check_args_size(const char *name, PyObject* args, PyObject* kw, int n)
1891{
1892 Py_ssize_t m = PyTuple_GET_SIZE(args) + (kw ? PyDict_Size(kw) : 0);
1893 if (m <= n)
1894 return 1;
1895 PyErr_Format(PyExc_TypeError,
1896 "%s() takes at most %d positional arguments (%zd given)",
1897 name, n, m);
1898 return 0;
1899}
1900
1901static PyObject*
1902fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
1903{
1904 if (string2 != NULL) {
1905 char buf[100];
1906 if (string != NULL) {
1907 PyErr_Format(PyExc_TypeError,
1908 "Argument given by name ('%s') and position (1)",
1909 oldname);
1910 return NULL;
1911 }
1912 sprintf(buf, "The '%s' keyword parameter name is deprecated. "
1913 "Use 'string' instead.", oldname);
1914 if (PyErr_Warn(PyExc_DeprecationWarning, buf) < 0)
1915 return NULL;
1916 return string2;
1917 }
1918 if (string == NULL) {
1919 PyErr_SetString(PyExc_TypeError,
1920 "Required argument 'string' (pos 1) not found");
1921 return NULL;
1922 }
1923 return string;
1924}
1925
Guido van Rossumb700df92000-03-31 14:59:30 +00001926static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001927pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001928{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001929 SRE_STATE state;
1930 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001931
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001932 PyObject *string = NULL, *string2 = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001933 Py_ssize_t start = 0;
1934 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001935 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1936 if (!check_args_size("match", args, kw, 3))
1937 return NULL;
1938
1939 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:match", kwlist,
1940 &string, &start, &end, &string2))
1941 return NULL;
1942
1943 string = fix_string_param(string, string2, "pattern");
1944 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001945 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001946
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001947 string = state_init(&state, self, string, start, end);
1948 if (!string)
1949 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001950
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001951 state.ptr = state.start;
1952
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001953 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1954
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001955 if (state.charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001956 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001957 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001958#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001959 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00001960#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001961 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001962
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001963 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00001964 if (PyErr_Occurred())
1965 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001966
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001967 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001968
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001969 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001970}
1971
1972static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001973pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001974{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001975 SRE_STATE state;
1976 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001977
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001978 PyObject *string = NULL, *string2 = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00001979 Py_ssize_t start = 0;
1980 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001981 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
1982 if (!check_args_size("search", args, kw, 3))
1983 return NULL;
1984
1985 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:search", kwlist,
1986 &string, &start, &end, &string2))
1987 return NULL;
1988
1989 string = fix_string_param(string, string2, "pattern");
1990 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001991 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001992
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001993 string = state_init(&state, self, string, start, end);
1994 if (!string)
1995 return NULL;
1996
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001997 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1998
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001999 if (state.charsize == 1) {
2000 status = sre_search(&state, PatternObject_GetCode(self));
2001 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002002#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002003 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002004#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002005 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002006
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00002007 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2008
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002009 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00002010
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002011 if (PyErr_Occurred())
2012 return NULL;
2013
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002014 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00002015}
2016
2017static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002018call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002019{
2020 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002021 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002022 PyObject* func;
2023 PyObject* result;
2024
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002025 if (!args)
2026 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002027 name = PyString_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002028 if (!name)
2029 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002030 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002031 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002032 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002033 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002034 func = PyObject_GetAttrString(mod, function);
2035 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002036 if (!func)
2037 return NULL;
2038 result = PyObject_CallObject(func, args);
2039 Py_DECREF(func);
2040 Py_DECREF(args);
2041 return result;
2042}
2043
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002044#ifdef USE_BUILTIN_COPY
2045static int
2046deepcopy(PyObject** object, PyObject* memo)
2047{
2048 PyObject* copy;
2049
2050 copy = call(
2051 "copy", "deepcopy",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002052 PyTuple_Pack(2, *object, memo)
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002053 );
2054 if (!copy)
2055 return 0;
2056
Serhiy Storchaka763a61c2016-04-10 18:05:12 +03002057 Py_SETREF(*object, copy);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002058
2059 return 1; /* success */
2060}
2061#endif
2062
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002063static PyObject*
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002064join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002065{
2066 /* join list elements */
2067
2068 PyObject* joiner;
2069#if PY_VERSION_HEX >= 0x01060000
2070 PyObject* function;
2071 PyObject* args;
2072#endif
2073 PyObject* result;
2074
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002075 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002076 if (!joiner)
2077 return NULL;
2078
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002079 if (PyList_GET_SIZE(list) == 0) {
2080 Py_DECREF(list);
2081 return joiner;
2082 }
2083
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002084#if PY_VERSION_HEX >= 0x01060000
2085 function = PyObject_GetAttrString(joiner, "join");
2086 if (!function) {
2087 Py_DECREF(joiner);
2088 return NULL;
2089 }
2090 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002091 if (!args) {
2092 Py_DECREF(function);
2093 Py_DECREF(joiner);
2094 return NULL;
2095 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002096 PyTuple_SET_ITEM(args, 0, list);
2097 result = PyObject_CallObject(function, args);
2098 Py_DECREF(args); /* also removes list */
2099 Py_DECREF(function);
2100#else
2101 result = call(
2102 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002103 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002104 );
2105#endif
2106 Py_DECREF(joiner);
2107
2108 return result;
2109}
2110
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002111static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002112pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002113{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002114 SRE_STATE state;
2115 PyObject* list;
2116 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002117 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002118
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02002119 PyObject *string = NULL, *string2 = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002120 Py_ssize_t start = 0;
2121 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02002122 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2123 if (!check_args_size("findall", args, kw, 3))
2124 return NULL;
2125
2126 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnnO:findall", kwlist,
2127 &string, &start, &end, &string2))
2128 return NULL;
2129
2130 string = fix_string_param(string, string2, "source");
2131 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002132 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002133
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002134 string = state_init(&state, self, string, start, end);
2135 if (!string)
2136 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002137
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002138 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002139 if (!list) {
2140 state_fini(&state);
2141 return NULL;
2142 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002143
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002144 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002145
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002146 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002147
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002148 state_reset(&state);
2149
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002150 state.ptr = state.start;
2151
2152 if (state.charsize == 1) {
2153 status = sre_search(&state, PatternObject_GetCode(self));
2154 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002155#if defined(HAVE_UNICODE)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002156 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002157#endif
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002158 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002159
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002160 if (PyErr_Occurred())
2161 goto error;
2162
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002163 if (status <= 0) {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00002164 if (status == 0)
2165 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002166 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002167 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002168 }
Tim Peters3d563502006-01-21 02:47:53 +00002169
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002170 /* don't bother to build a match object */
2171 switch (self->groups) {
2172 case 0:
2173 b = STATE_OFFSET(&state, state.start);
2174 e = STATE_OFFSET(&state, state.ptr);
2175 item = PySequence_GetSlice(string, b, e);
2176 if (!item)
2177 goto error;
2178 break;
2179 case 1:
2180 item = state_getslice(&state, 1, string, 1);
2181 if (!item)
2182 goto error;
2183 break;
2184 default:
2185 item = PyTuple_New(self->groups);
2186 if (!item)
2187 goto error;
2188 for (i = 0; i < self->groups; i++) {
2189 PyObject* o = state_getslice(&state, i+1, string, 1);
2190 if (!o) {
2191 Py_DECREF(item);
2192 goto error;
2193 }
2194 PyTuple_SET_ITEM(item, i, o);
2195 }
2196 break;
2197 }
2198
2199 status = PyList_Append(list, item);
2200 Py_DECREF(item);
2201 if (status < 0)
2202 goto error;
2203
2204 if (state.ptr == state.start)
2205 state.start = (void*) ((char*) state.ptr + state.charsize);
2206 else
2207 state.start = state.ptr;
2208
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002209 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002210
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002211 state_fini(&state);
2212 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002213
2214error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002215 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002216 state_fini(&state);
2217 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002218
Guido van Rossumb700df92000-03-31 14:59:30 +00002219}
2220
Fredrik Lundh703ce812001-10-24 22:16:30 +00002221#if PY_VERSION_HEX >= 0x02020000
2222static PyObject*
2223pattern_finditer(PatternObject* pattern, PyObject* args)
2224{
2225 PyObject* scanner;
2226 PyObject* search;
2227 PyObject* iterator;
2228
2229 scanner = pattern_scanner(pattern, args);
2230 if (!scanner)
2231 return NULL;
2232
2233 search = PyObject_GetAttrString(scanner, "search");
2234 Py_DECREF(scanner);
2235 if (!search)
2236 return NULL;
2237
2238 iterator = PyCallIter_New(search, Py_None);
2239 Py_DECREF(search);
2240
2241 return iterator;
2242}
2243#endif
2244
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002245static PyObject*
2246pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2247{
2248 SRE_STATE state;
2249 PyObject* list;
2250 PyObject* item;
2251 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002252 Py_ssize_t n;
2253 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002254 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002255
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02002256 PyObject *string = NULL, *string2 = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002257 Py_ssize_t maxsplit = 0;
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02002258 static char* kwlist[] = { "string", "maxsplit", "source", NULL };
2259 if (!check_args_size("split", args, kw, 2))
2260 return NULL;
2261
2262 if (!PyArg_ParseTupleAndKeywords(args, kw, "|OnO:split", kwlist,
2263 &string, &maxsplit, &string2))
2264 return NULL;
2265
2266 string = fix_string_param(string, string2, "source");
2267 if (!string)
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002268 return NULL;
2269
Serhiy Storchaka955b6762017-05-18 12:34:40 +03002270 if (Py_Py3kWarningFlag &&
2271 (self->code[0] != SRE_OP_INFO || self->code[3] == 0))
2272 {
2273 if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
2274 if (PyErr_WarnPy3k("split() requires a non-empty pattern match.",
2275 1) < 0)
2276 return NULL;
2277 }
2278 else if (PyErr_WarnEx(PyExc_FutureWarning,
2279 "split() requires a non-empty pattern match.",
2280 1) < 0)
2281 return NULL;
2282 }
2283
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002284 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002285 if (!string)
2286 return NULL;
2287
2288 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002289 if (!list) {
2290 state_fini(&state);
2291 return NULL;
2292 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002293
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002294 n = 0;
2295 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002296
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002297 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002298
2299 state_reset(&state);
2300
2301 state.ptr = state.start;
2302
2303 if (state.charsize == 1) {
2304 status = sre_search(&state, PatternObject_GetCode(self));
2305 } else {
2306#if defined(HAVE_UNICODE)
2307 status = sre_usearch(&state, PatternObject_GetCode(self));
2308#endif
2309 }
2310
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002311 if (PyErr_Occurred())
2312 goto error;
2313
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002314 if (status <= 0) {
2315 if (status == 0)
2316 break;
2317 pattern_error(status);
2318 goto error;
2319 }
Tim Peters3d563502006-01-21 02:47:53 +00002320
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002321 if (state.start == state.ptr) {
Serhiy Storchaka7865f212015-07-06 13:58:24 +03002322 if (last == state.end || state.ptr == state.end)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002323 break;
2324 /* skip one character */
2325 state.start = (void*) ((char*) state.ptr + state.charsize);
2326 continue;
2327 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002328
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002329 /* get segment before this match */
2330 item = PySequence_GetSlice(
2331 string, STATE_OFFSET(&state, last),
2332 STATE_OFFSET(&state, state.start)
2333 );
2334 if (!item)
2335 goto error;
2336 status = PyList_Append(list, item);
2337 Py_DECREF(item);
2338 if (status < 0)
2339 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002340
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002341 /* add groups (if any) */
2342 for (i = 0; i < self->groups; i++) {
2343 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002344 if (!item)
2345 goto error;
2346 status = PyList_Append(list, item);
2347 Py_DECREF(item);
2348 if (status < 0)
2349 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002350 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002351
2352 n = n + 1;
2353
2354 last = state.start = state.ptr;
2355
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002356 }
2357
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002358 /* get segment following last match (even if empty) */
2359 item = PySequence_GetSlice(
2360 string, STATE_OFFSET(&state, last), state.endpos
2361 );
2362 if (!item)
2363 goto error;
2364 status = PyList_Append(list, item);
2365 Py_DECREF(item);
2366 if (status < 0)
2367 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002368
2369 state_fini(&state);
2370 return list;
2371
2372error:
2373 Py_DECREF(list);
2374 state_fini(&state);
2375 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002376
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002377}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002378
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002379static PyObject*
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002380pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002381 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002382{
2383 SRE_STATE state;
2384 PyObject* list;
2385 PyObject* item;
2386 PyObject* filter;
2387 PyObject* args;
2388 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002389 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002390 int status;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002391 Py_ssize_t n;
2392 Py_ssize_t i, b, e;
2393 int bint;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002394 int filter_is_callable;
2395
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002396 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002397 /* sub/subn takes either a function or a template */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002398 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002399 Py_INCREF(filter);
2400 filter_is_callable = 1;
2401 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002402 /* if not callable, check if it's a literal string */
2403 int literal;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002404 ptr = getstring(ptemplate, &n, &bint);
2405 b = bint;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002406 if (ptr) {
2407 if (b == 1) {
Skip Montanaro816a1622006-04-18 11:53:09 +00002408 literal = sre_literal_template((unsigned char *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002409 } else {
2410#if defined(HAVE_UNICODE)
Skip Montanaro816a1622006-04-18 11:53:09 +00002411 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002412#endif
2413 }
2414 } else {
2415 PyErr_Clear();
2416 literal = 0;
2417 }
2418 if (literal) {
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002419 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002420 Py_INCREF(filter);
2421 filter_is_callable = 0;
2422 } else {
2423 /* not a literal; hand it over to the template compiler */
2424 filter = call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00002425 SRE_PY_MODULE, "_subx",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002426 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002427 );
2428 if (!filter)
2429 return NULL;
2430 filter_is_callable = PyCallable_Check(filter);
2431 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002432 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002433
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002434 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002435 if (!string) {
2436 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002437 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002438 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002439
2440 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002441 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002442 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002443 state_fini(&state);
2444 return NULL;
2445 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002446
2447 n = i = 0;
2448
2449 while (!count || n < count) {
2450
2451 state_reset(&state);
2452
2453 state.ptr = state.start;
2454
2455 if (state.charsize == 1) {
2456 status = sre_search(&state, PatternObject_GetCode(self));
2457 } else {
2458#if defined(HAVE_UNICODE)
2459 status = sre_usearch(&state, PatternObject_GetCode(self));
2460#endif
2461 }
2462
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00002463 if (PyErr_Occurred())
2464 goto error;
2465
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002466 if (status <= 0) {
2467 if (status == 0)
2468 break;
2469 pattern_error(status);
2470 goto error;
2471 }
Tim Peters3d563502006-01-21 02:47:53 +00002472
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002473 b = STATE_OFFSET(&state, state.start);
2474 e = STATE_OFFSET(&state, state.ptr);
2475
2476 if (i < b) {
2477 /* get segment before this match */
2478 item = PySequence_GetSlice(string, i, b);
2479 if (!item)
2480 goto error;
2481 status = PyList_Append(list, item);
2482 Py_DECREF(item);
2483 if (status < 0)
2484 goto error;
2485
2486 } else if (i == b && i == e && n > 0)
2487 /* ignore empty match on latest position */
2488 goto next;
2489
2490 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002491 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002492 match = pattern_new_match(self, &state, 1);
2493 if (!match)
2494 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002495 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002496 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002497 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002498 goto error;
2499 }
2500 item = PyObject_CallObject(filter, args);
2501 Py_DECREF(args);
2502 Py_DECREF(match);
2503 if (!item)
2504 goto error;
2505 } else {
2506 /* filter is literal string */
2507 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002508 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002509 }
2510
2511 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002512 if (item != Py_None) {
2513 status = PyList_Append(list, item);
2514 Py_DECREF(item);
2515 if (status < 0)
2516 goto error;
2517 }
Tim Peters3d563502006-01-21 02:47:53 +00002518
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002519 i = e;
2520 n = n + 1;
2521
2522next:
2523 /* move on */
Serhiy Storchaka7865f212015-07-06 13:58:24 +03002524 if (state.ptr == state.end)
2525 break;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002526 if (state.ptr == state.start)
2527 state.start = (void*) ((char*) state.ptr + state.charsize);
2528 else
2529 state.start = state.ptr;
2530
2531 }
2532
2533 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002534 if (i < state.endpos) {
2535 item = PySequence_GetSlice(string, i, state.endpos);
2536 if (!item)
2537 goto error;
2538 status = PyList_Append(list, item);
2539 Py_DECREF(item);
2540 if (status < 0)
2541 goto error;
2542 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002543
2544 state_fini(&state);
2545
Guido van Rossum4e173842001-12-07 04:25:10 +00002546 Py_DECREF(filter);
2547
Fredrik Lundhdac58492001-10-21 21:48:30 +00002548 /* convert list to single string (also removes list) */
Guido van Rossum1ff91d92007-09-10 22:02:25 +00002549 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002550
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002551 if (!item)
2552 return NULL;
2553
2554 if (subn)
Antoine Pitroub83575b2012-12-02 12:52:36 +01002555 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002556
2557 return item;
2558
2559error:
2560 Py_DECREF(list);
2561 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002562 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002563 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002564
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002565}
2566
2567static PyObject*
2568pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2569{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002570 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002571 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002572 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002573 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002574 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002575 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002576 return NULL;
2577
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002578 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002579}
2580
2581static PyObject*
2582pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2583{
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002584 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002585 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002586 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002587 static char* kwlist[] = { "repl", "string", "count", NULL };
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002588 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002589 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002590 return NULL;
2591
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002592 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002593}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002594
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002595static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00002596pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002597{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002598#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002599 PatternObject* copy;
2600 int offset;
2601
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002602 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2603 if (!copy)
2604 return NULL;
2605
2606 offset = offsetof(PatternObject, groups);
2607
2608 Py_XINCREF(self->groupindex);
2609 Py_XINCREF(self->indexgroup);
2610 Py_XINCREF(self->pattern);
2611
2612 memcpy((char*) copy + offset, (char*) self + offset,
2613 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002614 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002615
2616 return (PyObject*) copy;
2617#else
2618 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2619 return NULL;
2620#endif
2621}
2622
2623static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00002624pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002625{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002626#ifdef USE_BUILTIN_COPY
2627 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002628
Georg Brandlfbef5882006-05-28 22:14:04 +00002629 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002630 if (!copy)
2631 return NULL;
2632
2633 if (!deepcopy(&copy->groupindex, memo) ||
2634 !deepcopy(&copy->indexgroup, memo) ||
2635 !deepcopy(&copy->pattern, memo)) {
2636 Py_DECREF(copy);
2637 return NULL;
2638 }
2639
2640#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002641 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2642 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002643#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002644}
2645
Raymond Hettinger94478742004-09-24 04:31:19 +00002646PyDoc_STRVAR(pattern_match_doc,
2647"match(string[, pos[, endpos]]) --> match object or None.\n\
2648 Matches zero or more characters at the beginning of the string");
2649
2650PyDoc_STRVAR(pattern_search_doc,
2651"search(string[, pos[, endpos]]) --> match object or None.\n\
2652 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlovc08ded92012-12-25 18:50:03 +02002653 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002654
2655PyDoc_STRVAR(pattern_split_doc,
2656"split(string[, maxsplit = 0]) --> list.\n\
2657 Split string by the occurrences of pattern.");
2658
2659PyDoc_STRVAR(pattern_findall_doc,
2660"findall(string[, pos[, endpos]]) --> list.\n\
2661 Return a list of all non-overlapping matches of pattern in string.");
2662
2663PyDoc_STRVAR(pattern_finditer_doc,
2664"finditer(string[, pos[, endpos]]) --> iterator.\n\
2665 Return an iterator over all non-overlapping matches for the \n\
2666 RE pattern in string. For each match, the iterator returns a\n\
2667 match object.");
2668
2669PyDoc_STRVAR(pattern_sub_doc,
2670"sub(repl, string[, count = 0]) --> newstring\n\
2671 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002672 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002673
2674PyDoc_STRVAR(pattern_subn_doc,
2675"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2676 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2677 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002678 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002679
2680PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2681
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002682static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002683 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002684 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002685 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002686 pattern_search_doc},
2687 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2688 pattern_sub_doc},
2689 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2690 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002691 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002692 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002693 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Raymond Hettinger94478742004-09-24 04:31:19 +00002694 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002695#if PY_VERSION_HEX >= 0x02020000
Raymond Hettinger94478742004-09-24 04:31:19 +00002696 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2697 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002698#endif
Fredrik Lundh562586e2000-10-03 20:43:34 +00002699 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00002700 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2701 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002702 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002703};
2704
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002705#define PAT_OFF(x) offsetof(PatternObject, x)
2706static PyMemberDef pattern_members[] = {
2707 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2708 {"flags", T_INT, PAT_OFF(flags), READONLY},
2709 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2710 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2711 {NULL} /* Sentinel */
2712};
Guido van Rossumb700df92000-03-31 14:59:30 +00002713
2714statichere PyTypeObject Pattern_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002715 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00002716 0, "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002717 sizeof(PatternObject), sizeof(SRE_CODE),
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002718 (destructor)pattern_dealloc, /*tp_dealloc*/
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002719 0, /* tp_print */
2720 0, /* tp_getattrn */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002721 0, /* tp_setattr */
2722 0, /* tp_compare */
2723 0, /* tp_repr */
2724 0, /* tp_as_number */
2725 0, /* tp_as_sequence */
2726 0, /* tp_as_mapping */
2727 0, /* tp_hash */
2728 0, /* tp_call */
2729 0, /* tp_str */
2730 0, /* tp_getattro */
2731 0, /* tp_setattro */
2732 0, /* tp_as_buffer */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002733 Py_TPFLAGS_DEFAULT, /* tp_flags */
Raymond Hettinger94478742004-09-24 04:31:19 +00002734 pattern_doc, /* tp_doc */
Raymond Hettinger027bb632004-05-31 03:09:25 +00002735 0, /* tp_traverse */
2736 0, /* tp_clear */
2737 0, /* tp_richcompare */
2738 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05002739 0, /* tp_iter */
2740 0, /* tp_iternext */
2741 pattern_methods, /* tp_methods */
2742 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002743};
2744
Guido van Rossum8b762f02008-08-05 03:39:21 +00002745static int _validate(PatternObject *self); /* Forward */
2746
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002747static PyObject *
2748_compile(PyObject* self_, PyObject* args)
2749{
2750 /* "compile" pattern descriptor to pattern object */
2751
2752 PatternObject* self;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002753 Py_ssize_t i, n;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002754
2755 PyObject* pattern;
2756 int flags = 0;
2757 PyObject* code;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002758 Py_ssize_t groups = 0;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002759 PyObject* groupindex = NULL;
2760 PyObject* indexgroup = NULL;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00002761 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002762 &PyList_Type, &code, &groups,
2763 &groupindex, &indexgroup))
2764 return NULL;
2765
2766 n = PyList_GET_SIZE(code);
Christian Heimes4956d2b2008-01-18 19:12:56 +00002767 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002768 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2769 if (!self)
2770 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002771 self->weakreflist = NULL;
2772 self->pattern = NULL;
2773 self->groupindex = NULL;
2774 self->indexgroup = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002775
2776 self->codesize = n;
2777
2778 for (i = 0; i < n; i++) {
2779 PyObject *o = PyList_GET_ITEM(code, i);
2780 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2781 : PyLong_AsUnsignedLong(o);
Antoine Pitroub83ea142012-11-20 22:30:42 +01002782 if (value == (unsigned long)-1 && PyErr_Occurred()) {
2783 if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2784 PyErr_SetString(PyExc_OverflowError,
2785 "regular expression code size limit exceeded");
2786 }
2787 break;
2788 }
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002789 self->code[i] = (SRE_CODE) value;
2790 if ((unsigned long) self->code[i] != value) {
2791 PyErr_SetString(PyExc_OverflowError,
2792 "regular expression code size limit exceeded");
2793 break;
2794 }
2795 }
2796
2797 if (PyErr_Occurred()) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00002798 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002799 return NULL;
2800 }
2801
2802 Py_INCREF(pattern);
2803 self->pattern = pattern;
2804
2805 self->flags = flags;
2806
2807 self->groups = groups;
2808
2809 Py_XINCREF(groupindex);
2810 self->groupindex = groupindex;
2811
2812 Py_XINCREF(indexgroup);
2813 self->indexgroup = indexgroup;
2814
2815 self->weakreflist = NULL;
2816
Guido van Rossum8b762f02008-08-05 03:39:21 +00002817 if (!_validate(self)) {
2818 Py_DECREF(self);
2819 return NULL;
2820 }
2821
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00002822 return (PyObject*) self;
2823}
2824
Guido van Rossumb700df92000-03-31 14:59:30 +00002825/* -------------------------------------------------------------------- */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002826/* Code validation */
2827
2828/* To learn more about this code, have a look at the _compile() function in
2829 Lib/sre_compile.py. The validation functions below checks the code array
2830 for conformance with the code patterns generated there.
2831
2832 The nice thing about the generated code is that it is position-independent:
2833 all jumps are relative jumps forward. Also, jumps don't cross each other:
2834 the target of a later jump is always earlier than the target of an earlier
2835 jump. IOW, this is okay:
2836
2837 J---------J-------T--------T
2838 \ \_____/ /
2839 \______________________/
2840
2841 but this is not:
2842
2843 J---------J-------T--------T
2844 \_________\_____/ /
2845 \____________/
2846
Serhiy Storchakafdb73ed2013-10-27 08:00:57 +02002847 It also helps that SRE_CODE is always an unsigned type.
Guido van Rossum8b762f02008-08-05 03:39:21 +00002848*/
2849
2850/* Defining this one enables tracing of the validator */
2851#undef VVERBOSE
2852
2853/* Trace macro for the validator */
2854#if defined(VVERBOSE)
2855#define VTRACE(v) printf v
2856#else
Senthil Kumarand5830682011-10-20 02:13:23 +08002857#define VTRACE(v) do {} while(0) /* do nothing */
Guido van Rossum8b762f02008-08-05 03:39:21 +00002858#endif
2859
2860/* Report failure */
2861#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2862
2863/* Extract opcode, argument, or skip count from code array */
2864#define GET_OP \
2865 do { \
2866 VTRACE(("%p: ", code)); \
2867 if (code >= end) FAIL; \
2868 op = *code++; \
2869 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2870 } while (0)
2871#define GET_ARG \
2872 do { \
2873 VTRACE(("%p= ", code)); \
2874 if (code >= end) FAIL; \
2875 arg = *code++; \
2876 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2877 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002878#define GET_SKIP_ADJ(adj) \
Guido van Rossum8b762f02008-08-05 03:39:21 +00002879 do { \
2880 VTRACE(("%p= ", code)); \
2881 if (code >= end) FAIL; \
2882 skip = *code; \
2883 VTRACE(("%lu (skip to %p)\n", \
2884 (unsigned long)skip, code+skip)); \
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +03002885 if (skip-adj > end-code) \
Guido van Rossum8b762f02008-08-05 03:39:21 +00002886 FAIL; \
2887 code++; \
2888 } while (0)
Guido van Rossume3c4fd92008-09-10 14:27:00 +00002889#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum8b762f02008-08-05 03:39:21 +00002890
2891static int
2892_validate_charset(SRE_CODE *code, SRE_CODE *end)
2893{
2894 /* Some variables are manipulated by the macros above */
2895 SRE_CODE op;
2896 SRE_CODE arg;
2897 SRE_CODE offset;
2898 int i;
2899
2900 while (code < end) {
2901 GET_OP;
2902 switch (op) {
2903
2904 case SRE_OP_NEGATE:
2905 break;
2906
2907 case SRE_OP_LITERAL:
2908 GET_ARG;
2909 break;
2910
2911 case SRE_OP_RANGE:
2912 GET_ARG;
2913 GET_ARG;
2914 break;
2915
2916 case SRE_OP_CHARSET:
2917 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +03002918 if (offset > end-code)
Guido van Rossum8b762f02008-08-05 03:39:21 +00002919 FAIL;
2920 code += offset;
2921 break;
2922
2923 case SRE_OP_BIGCHARSET:
2924 GET_ARG; /* Number of blocks */
2925 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +03002926 if (offset > end-code)
Guido van Rossum8b762f02008-08-05 03:39:21 +00002927 FAIL;
2928 /* Make sure that each byte points to a valid block */
2929 for (i = 0; i < 256; i++) {
2930 if (((unsigned char *)code)[i] >= arg)
2931 FAIL;
2932 }
2933 code += offset;
2934 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +03002935 if (offset > end-code)
Guido van Rossum8b762f02008-08-05 03:39:21 +00002936 FAIL;
2937 code += offset;
2938 break;
2939
2940 case SRE_OP_CATEGORY:
2941 GET_ARG;
2942 switch (arg) {
2943 case SRE_CATEGORY_DIGIT:
2944 case SRE_CATEGORY_NOT_DIGIT:
2945 case SRE_CATEGORY_SPACE:
2946 case SRE_CATEGORY_NOT_SPACE:
2947 case SRE_CATEGORY_WORD:
2948 case SRE_CATEGORY_NOT_WORD:
2949 case SRE_CATEGORY_LINEBREAK:
2950 case SRE_CATEGORY_NOT_LINEBREAK:
2951 case SRE_CATEGORY_LOC_WORD:
2952 case SRE_CATEGORY_LOC_NOT_WORD:
2953 case SRE_CATEGORY_UNI_DIGIT:
2954 case SRE_CATEGORY_UNI_NOT_DIGIT:
2955 case SRE_CATEGORY_UNI_SPACE:
2956 case SRE_CATEGORY_UNI_NOT_SPACE:
2957 case SRE_CATEGORY_UNI_WORD:
2958 case SRE_CATEGORY_UNI_NOT_WORD:
2959 case SRE_CATEGORY_UNI_LINEBREAK:
2960 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2961 break;
2962 default:
2963 FAIL;
2964 }
2965 break;
2966
2967 default:
2968 FAIL;
2969
2970 }
2971 }
2972
2973 return 1;
2974}
2975
2976static int
2977_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2978{
2979 /* Some variables are manipulated by the macros above */
2980 SRE_CODE op;
2981 SRE_CODE arg;
2982 SRE_CODE skip;
2983
2984 VTRACE(("code=%p, end=%p\n", code, end));
2985
2986 if (code > end)
2987 FAIL;
2988
2989 while (code < end) {
2990 GET_OP;
2991 switch (op) {
2992
2993 case SRE_OP_MARK:
2994 /* We don't check whether marks are properly nested; the
2995 sre_match() code is robust even if they don't, and the worst
2996 you can get is nonsensical match results. */
2997 GET_ARG;
2998 if (arg > 2*groups+1) {
2999 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
3000 FAIL;
3001 }
3002 break;
3003
3004 case SRE_OP_LITERAL:
3005 case SRE_OP_NOT_LITERAL:
3006 case SRE_OP_LITERAL_IGNORE:
3007 case SRE_OP_NOT_LITERAL_IGNORE:
3008 GET_ARG;
3009 /* The arg is just a character, nothing to check */
3010 break;
3011
3012 case SRE_OP_SUCCESS:
3013 case SRE_OP_FAILURE:
3014 /* Nothing to check; these normally end the matching process */
3015 break;
3016
3017 case SRE_OP_AT:
3018 GET_ARG;
3019 switch (arg) {
3020 case SRE_AT_BEGINNING:
3021 case SRE_AT_BEGINNING_STRING:
3022 case SRE_AT_BEGINNING_LINE:
3023 case SRE_AT_END:
3024 case SRE_AT_END_LINE:
3025 case SRE_AT_END_STRING:
3026 case SRE_AT_BOUNDARY:
3027 case SRE_AT_NON_BOUNDARY:
3028 case SRE_AT_LOC_BOUNDARY:
3029 case SRE_AT_LOC_NON_BOUNDARY:
3030 case SRE_AT_UNI_BOUNDARY:
3031 case SRE_AT_UNI_NON_BOUNDARY:
3032 break;
3033 default:
3034 FAIL;
3035 }
3036 break;
3037
3038 case SRE_OP_ANY:
3039 case SRE_OP_ANY_ALL:
3040 /* These have no operands */
3041 break;
3042
3043 case SRE_OP_IN:
3044 case SRE_OP_IN_IGNORE:
3045 GET_SKIP;
3046 /* Stop 1 before the end; we check the FAILURE below */
3047 if (!_validate_charset(code, code+skip-2))
3048 FAIL;
3049 if (code[skip-2] != SRE_OP_FAILURE)
3050 FAIL;
3051 code += skip-1;
3052 break;
3053
3054 case SRE_OP_INFO:
3055 {
3056 /* A minimal info field is
3057 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
3058 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
3059 more follows. */
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00003060 SRE_CODE flags, i;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003061 SRE_CODE *newcode;
3062 GET_SKIP;
3063 newcode = code+skip-1;
3064 GET_ARG; flags = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00003065 GET_ARG; /* min */
3066 GET_ARG; /* max */
Guido van Rossum8b762f02008-08-05 03:39:21 +00003067 /* Check that only valid flags are present */
3068 if ((flags & ~(SRE_INFO_PREFIX |
3069 SRE_INFO_LITERAL |
3070 SRE_INFO_CHARSET)) != 0)
3071 FAIL;
3072 /* PREFIX and CHARSET are mutually exclusive */
3073 if ((flags & SRE_INFO_PREFIX) &&
3074 (flags & SRE_INFO_CHARSET))
3075 FAIL;
3076 /* LITERAL implies PREFIX */
3077 if ((flags & SRE_INFO_LITERAL) &&
3078 !(flags & SRE_INFO_PREFIX))
3079 FAIL;
3080 /* Validate the prefix */
3081 if (flags & SRE_INFO_PREFIX) {
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00003082 SRE_CODE prefix_len;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003083 GET_ARG; prefix_len = arg;
Brett Cannon8ffe7bb2010-05-03 23:51:28 +00003084 GET_ARG; /* prefix skip */
Guido van Rossum8b762f02008-08-05 03:39:21 +00003085 /* Here comes the prefix string */
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +03003086 if (prefix_len > newcode-code)
Guido van Rossum8b762f02008-08-05 03:39:21 +00003087 FAIL;
3088 code += prefix_len;
3089 /* And here comes the overlap table */
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +03003090 if (prefix_len > newcode-code)
Guido van Rossum8b762f02008-08-05 03:39:21 +00003091 FAIL;
3092 /* Each overlap value should be < prefix_len */
3093 for (i = 0; i < prefix_len; i++) {
3094 if (code[i] >= prefix_len)
3095 FAIL;
3096 }
3097 code += prefix_len;
3098 }
3099 /* Validate the charset */
3100 if (flags & SRE_INFO_CHARSET) {
3101 if (!_validate_charset(code, newcode-1))
3102 FAIL;
3103 if (newcode[-1] != SRE_OP_FAILURE)
3104 FAIL;
3105 code = newcode;
3106 }
3107 else if (code != newcode) {
3108 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3109 FAIL;
3110 }
3111 }
3112 break;
3113
3114 case SRE_OP_BRANCH:
3115 {
3116 SRE_CODE *target = NULL;
3117 for (;;) {
3118 GET_SKIP;
3119 if (skip == 0)
3120 break;
3121 /* Stop 2 before the end; we check the JUMP below */
3122 if (!_validate_inner(code, code+skip-3, groups))
3123 FAIL;
3124 code += skip-3;
3125 /* Check that it ends with a JUMP, and that each JUMP
3126 has the same target */
3127 GET_OP;
3128 if (op != SRE_OP_JUMP)
3129 FAIL;
3130 GET_SKIP;
3131 if (target == NULL)
3132 target = code+skip-1;
3133 else if (code+skip-1 != target)
3134 FAIL;
3135 }
3136 }
3137 break;
3138
3139 case SRE_OP_REPEAT_ONE:
3140 case SRE_OP_MIN_REPEAT_ONE:
3141 {
3142 SRE_CODE min, max;
3143 GET_SKIP;
3144 GET_ARG; min = arg;
3145 GET_ARG; max = arg;
3146 if (min > max)
3147 FAIL;
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02003148 if (max > SRE_MAXREPEAT)
Guido van Rossum8b762f02008-08-05 03:39:21 +00003149 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003150 if (!_validate_inner(code, code+skip-4, groups))
3151 FAIL;
3152 code += skip-4;
3153 GET_OP;
3154 if (op != SRE_OP_SUCCESS)
3155 FAIL;
3156 }
3157 break;
3158
3159 case SRE_OP_REPEAT:
3160 {
3161 SRE_CODE min, max;
3162 GET_SKIP;
3163 GET_ARG; min = arg;
3164 GET_ARG; max = arg;
3165 if (min > max)
3166 FAIL;
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02003167 if (max > SRE_MAXREPEAT)
Guido van Rossum8b762f02008-08-05 03:39:21 +00003168 FAIL;
Guido van Rossum8b762f02008-08-05 03:39:21 +00003169 if (!_validate_inner(code, code+skip-3, groups))
3170 FAIL;
3171 code += skip-3;
3172 GET_OP;
3173 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3174 FAIL;
3175 }
3176 break;
3177
3178 case SRE_OP_GROUPREF:
3179 case SRE_OP_GROUPREF_IGNORE:
3180 GET_ARG;
3181 if (arg >= groups)
3182 FAIL;
3183 break;
3184
3185 case SRE_OP_GROUPREF_EXISTS:
3186 /* The regex syntax for this is: '(?(group)then|else)', where
3187 'group' is either an integer group number or a group name,
3188 'then' and 'else' are sub-regexes, and 'else' is optional. */
3189 GET_ARG;
3190 if (arg >= groups)
3191 FAIL;
Guido van Rossume3c4fd92008-09-10 14:27:00 +00003192 GET_SKIP_ADJ(1);
Guido van Rossum8b762f02008-08-05 03:39:21 +00003193 code--; /* The skip is relative to the first arg! */
3194 /* There are two possibilities here: if there is both a 'then'
3195 part and an 'else' part, the generated code looks like:
3196
3197 GROUPREF_EXISTS
3198 <group>
3199 <skipyes>
3200 ...then part...
3201 JUMP
3202 <skipno>
3203 (<skipyes> jumps here)
3204 ...else part...
3205 (<skipno> jumps here)
3206
3207 If there is only a 'then' part, it looks like:
3208
3209 GROUPREF_EXISTS
3210 <group>
3211 <skip>
3212 ...then part...
3213 (<skip> jumps here)
3214
3215 There is no direct way to decide which it is, and we don't want
3216 to allow arbitrary jumps anywhere in the code; so we just look
3217 for a JUMP opcode preceding our skip target.
3218 */
Serhiy Storchaka616f2fe2013-04-13 21:15:10 +03003219 if (skip >= 3 && skip-3 < end-code &&
Guido van Rossum8b762f02008-08-05 03:39:21 +00003220 code[skip-3] == SRE_OP_JUMP)
3221 {
3222 VTRACE(("both then and else parts present\n"));
3223 if (!_validate_inner(code+1, code+skip-3, groups))
3224 FAIL;
3225 code += skip-2; /* Position after JUMP, at <skipno> */
3226 GET_SKIP;
3227 if (!_validate_inner(code, code+skip-1, groups))
3228 FAIL;
3229 code += skip-1;
3230 }
3231 else {
3232 VTRACE(("only a then part present\n"));
3233 if (!_validate_inner(code+1, code+skip-1, groups))
3234 FAIL;
3235 code += skip-1;
3236 }
3237 break;
3238
3239 case SRE_OP_ASSERT:
3240 case SRE_OP_ASSERT_NOT:
3241 GET_SKIP;
3242 GET_ARG; /* 0 for lookahead, width for lookbehind */
3243 code--; /* Back up over arg to simplify math below */
3244 if (arg & 0x80000000)
3245 FAIL; /* Width too large */
3246 /* Stop 1 before the end; we check the SUCCESS below */
3247 if (!_validate_inner(code+1, code+skip-2, groups))
3248 FAIL;
3249 code += skip-2;
3250 GET_OP;
3251 if (op != SRE_OP_SUCCESS)
3252 FAIL;
3253 break;
3254
3255 default:
3256 FAIL;
3257
3258 }
3259 }
3260
3261 VTRACE(("okay\n"));
3262 return 1;
3263}
3264
3265static int
3266_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3267{
3268 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3269 FAIL;
3270 if (groups == 0) /* fix for simplejson */
3271 groups = 100; /* 100 groups should always be safe */
3272 return _validate_inner(code, end-1, groups);
3273}
3274
3275static int
3276_validate(PatternObject *self)
3277{
3278 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3279 {
3280 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3281 return 0;
3282 }
3283 else
3284 VTRACE(("Success!\n"));
3285 return 1;
3286}
3287
3288/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003289/* match methods */
3290
3291static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003292match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003293{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003294 Py_XDECREF(self->regs);
3295 Py_XDECREF(self->string);
3296 Py_DECREF(self->pattern);
3297 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003298}
3299
3300static PyObject*
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003301match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003302{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003303 if (index < 0 || index >= self->groups) {
3304 /* raise IndexError if we were given a bad group number */
3305 PyErr_SetString(
3306 PyExc_IndexError,
3307 "no such group"
3308 );
3309 return NULL;
3310 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003311
Fredrik Lundh6f013982000-07-03 18:44:21 +00003312 index *= 2;
3313
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003314 if (self->string == Py_None || self->mark[index] < 0) {
3315 /* return default value if the string or group is undefined */
3316 Py_INCREF(def);
3317 return def;
3318 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003319
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003320 return PySequence_GetSlice(
3321 self->string, self->mark[index], self->mark[index+1]
3322 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003323}
3324
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003325static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003326match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003327{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003328 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003329
Benjamin Petersonbc4ece52014-09-30 22:04:28 -04003330 if (PyInt_Check(index) || PyLong_Check(index))
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003331 return PyInt_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003332
Fredrik Lundh6f013982000-07-03 18:44:21 +00003333 i = -1;
3334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003335 if (self->pattern->groupindex) {
3336 index = PyObject_GetItem(self->pattern->groupindex, index);
3337 if (index) {
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003338 if (PyInt_Check(index) || PyLong_Check(index))
3339 i = PyInt_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003340 Py_DECREF(index);
3341 } else
3342 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003343 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003344
3345 return i;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003346}
3347
3348static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003349match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003350{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003351 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003352}
3353
3354static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003355match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003356{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003357 /* delegate to Python code */
3358 return call(
Neal Norwitz94a9c092006-03-16 06:30:02 +00003359 SRE_PY_MODULE, "_expand",
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003360 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003361 );
3362}
3363
3364static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003365match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003366{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003367 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003368 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003370 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003371
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003372 switch (size) {
3373 case 0:
3374 result = match_getslice(self, Py_False, Py_None);
3375 break;
3376 case 1:
3377 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3378 break;
3379 default:
3380 /* fetch multiple items */
3381 result = PyTuple_New(size);
3382 if (!result)
3383 return NULL;
3384 for (i = 0; i < size; i++) {
3385 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003386 self, PyTuple_GET_ITEM(args, i), Py_None
3387 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003388 if (!item) {
3389 Py_DECREF(result);
3390 return NULL;
3391 }
3392 PyTuple_SET_ITEM(result, i, item);
3393 }
3394 break;
3395 }
3396 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003397}
3398
3399static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003400match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003401{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003402 PyObject* result;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003403 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003404
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003405 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003406 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003407 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003408 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003410 result = PyTuple_New(self->groups-1);
3411 if (!result)
3412 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003413
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003414 for (index = 1; index < self->groups; index++) {
3415 PyObject* item;
3416 item = match_getslice_by_index(self, index, def);
3417 if (!item) {
3418 Py_DECREF(result);
3419 return NULL;
3420 }
3421 PyTuple_SET_ITEM(result, index-1, item);
3422 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003423
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003424 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003425}
3426
3427static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003428match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003429{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003430 PyObject* result;
3431 PyObject* keys;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003432 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003433
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003434 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003435 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003436 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003437 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003438
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003439 result = PyDict_New();
3440 if (!result || !self->pattern->groupindex)
3441 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003442
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003443 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003444 if (!keys)
3445 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003446
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003447 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003448 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003449 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003450 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003451 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003452 if (!key)
3453 goto failed;
3454 value = match_getslice(self, key, def);
Benjamin Peterson0f3596a2016-08-15 22:01:41 -07003455 if (!value)
Fredrik Lundh770617b2001-01-14 15:06:11 +00003456 goto failed;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003457 status = PyDict_SetItem(result, key, value);
3458 Py_DECREF(value);
3459 if (status < 0)
3460 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003461 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003462
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003463 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003464
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003465 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003466
3467failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003468 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003469 Py_DECREF(result);
3470 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003471}
3472
3473static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003474match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003475{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003476 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003477
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003478 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003479 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003480 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003481
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003482 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003483
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003484 if (index < 0 || index >= self->groups) {
3485 PyErr_SetString(
3486 PyExc_IndexError,
3487 "no such group"
3488 );
3489 return NULL;
3490 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003491
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003492 /* mark is -1 if group is undefined */
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003493 return PyInt_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003494}
3495
3496static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003497match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003498{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003499 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003500
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003501 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003502 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003503 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003504
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003505 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003506
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003507 if (index < 0 || index >= self->groups) {
3508 PyErr_SetString(
3509 PyExc_IndexError,
3510 "no such group"
3511 );
3512 return NULL;
3513 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003514
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003515 /* mark is -1 if group is undefined */
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003516 return PyInt_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003517}
3518
3519LOCAL(PyObject*)
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003520_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003521{
3522 PyObject* pair;
3523 PyObject* item;
3524
3525 pair = PyTuple_New(2);
3526 if (!pair)
3527 return NULL;
3528
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003529 item = PyInt_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003530 if (!item)
3531 goto error;
3532 PyTuple_SET_ITEM(pair, 0, item);
3533
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003534 item = PyInt_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003535 if (!item)
3536 goto error;
3537 PyTuple_SET_ITEM(pair, 1, item);
3538
3539 return pair;
3540
3541 error:
3542 Py_DECREF(pair);
3543 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003544}
3545
3546static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003547match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003548{
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003549 Py_ssize_t index;
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003550
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003551 PyObject* index_ = Py_False; /* zero */
Georg Brandl96a8c392006-05-29 21:04:52 +00003552 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003553 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003554
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003555 index = match_getindex(self, index_);
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003556
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003557 if (index < 0 || index >= self->groups) {
3558 PyErr_SetString(
3559 PyExc_IndexError,
3560 "no such group"
3561 );
3562 return NULL;
3563 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003564
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003565 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003566 return _pair(self->mark[index*2], self->mark[index*2+1]);
3567}
3568
3569static PyObject*
3570match_regs(MatchObject* self)
3571{
3572 PyObject* regs;
3573 PyObject* item;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003574 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003575
3576 regs = PyTuple_New(self->groups);
3577 if (!regs)
3578 return NULL;
3579
3580 for (index = 0; index < self->groups; index++) {
3581 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3582 if (!item) {
3583 Py_DECREF(regs);
3584 return NULL;
3585 }
3586 PyTuple_SET_ITEM(regs, index, item);
3587 }
3588
3589 Py_INCREF(regs);
3590 self->regs = regs;
3591
3592 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003593}
3594
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003595static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003596match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003597{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003598#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003599 MatchObject* copy;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003600 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003601
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003602 slots = 2 * (self->pattern->groups+1);
3603
3604 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3605 if (!copy)
3606 return NULL;
3607
3608 /* this value a constant, but any compiler should be able to
3609 figure that out all by itself */
3610 offset = offsetof(MatchObject, string);
3611
3612 Py_XINCREF(self->pattern);
3613 Py_XINCREF(self->string);
3614 Py_XINCREF(self->regs);
3615
3616 memcpy((char*) copy + offset, (char*) self + offset,
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003617 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003618
3619 return (PyObject*) copy;
3620#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003621 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003622 return NULL;
3623#endif
3624}
3625
3626static PyObject*
Georg Brandlfbef5882006-05-28 22:14:04 +00003627match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003628{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003629#ifdef USE_BUILTIN_COPY
3630 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003631
Georg Brandlfbef5882006-05-28 22:14:04 +00003632 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003633 if (!copy)
3634 return NULL;
3635
3636 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3637 !deepcopy(&copy->string, memo) ||
3638 !deepcopy(&copy->regs, memo)) {
3639 Py_DECREF(copy);
3640 return NULL;
3641 }
3642
3643#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003644 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3645 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003646#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003647}
3648
Andrew Svetlov1c6c90f2012-12-23 20:09:01 +02003649PyDoc_STRVAR(match_doc,
3650"The result of re.match() and re.search().\n\
3651Match objects always have a boolean value of True.");
3652
3653PyDoc_STRVAR(match_group_doc,
3654"group([group1, ...]) -> str or tuple.\n\
3655 Return subgroup(s) of the match by indices or names.\n\
3656 For 0 returns the entire match.");
3657
3658PyDoc_STRVAR(match_start_doc,
3659"start([group=0]) -> int.\n\
3660 Return index of the start of the substring matched by group.");
3661
3662PyDoc_STRVAR(match_end_doc,
3663"end([group=0]) -> int.\n\
3664 Return index of the end of the substring matched by group.");
3665
3666PyDoc_STRVAR(match_span_doc,
3667"span([group]) -> tuple.\n\
3668 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3669
3670PyDoc_STRVAR(match_groups_doc,
3671"groups([default=None]) -> tuple.\n\
3672 Return a tuple containing all the subgroups of the match, from 1.\n\
3673 The default argument is used for groups\n\
3674 that did not participate in the match");
3675
3676PyDoc_STRVAR(match_groupdict_doc,
3677"groupdict([default=None]) -> dict.\n\
3678 Return a dictionary containing all the named subgroups of the match,\n\
3679 keyed by the subgroup name. The default argument is used for groups\n\
3680 that did not participate in the match");
3681
3682PyDoc_STRVAR(match_expand_doc,
3683"expand(template) -> str.\n\
3684 Return the string obtained by doing backslash substitution\n\
3685 on the string template, as done by the sub() method.");
3686
3687static PyMethodDef match_methods[] = {
3688 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3689 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3690 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3691 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3692 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3693 match_groups_doc},
3694 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3695 match_groupdict_doc},
3696 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Georg Brandlfbef5882006-05-28 22:14:04 +00003697 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3698 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003699 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003700};
3701
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003702static PyObject *
3703match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003704{
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003705 if (self->lastindex >= 0)
Benjamin Peterson9dccb012013-01-10 10:37:47 -06003706 return PyInt_FromSsize_t(self->lastindex);
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003707 Py_INCREF(Py_None);
3708 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003709}
3710
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003711static PyObject *
3712match_lastgroup_get(MatchObject *self)
3713{
3714 if (self->pattern->indexgroup && self->lastindex >= 0) {
3715 PyObject* result = PySequence_GetItem(
3716 self->pattern->indexgroup, self->lastindex
3717 );
3718 if (result)
3719 return result;
3720 PyErr_Clear();
3721 }
3722 Py_INCREF(Py_None);
3723 return Py_None;
3724}
3725
3726static PyObject *
3727match_regs_get(MatchObject *self)
3728{
3729 if (self->regs) {
3730 Py_INCREF(self->regs);
3731 return self->regs;
3732 } else
3733 return match_regs(self);
3734}
3735
3736static PyGetSetDef match_getset[] = {
3737 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3738 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3739 {"regs", (getter)match_regs_get, (setter)NULL},
3740 {NULL}
3741};
3742
3743#define MATCH_OFF(x) offsetof(MatchObject, x)
3744static PyMemberDef match_members[] = {
3745 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3746 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3747 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3748 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3749 {NULL}
3750};
3751
3752
Guido van Rossumb700df92000-03-31 14:59:30 +00003753/* FIXME: implement setattr("string", None) as a special case (to
3754 detach the associated string, if any */
3755
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003756static PyTypeObject Match_Type = {
3757 PyVarObject_HEAD_INIT(NULL, 0)
3758 "_" SRE_MODULE ".SRE_Match",
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003759 sizeof(MatchObject), sizeof(Py_ssize_t),
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003760 (destructor)match_dealloc, /* tp_dealloc */
3761 0, /* tp_print */
3762 0, /* tp_getattr */
3763 0, /* tp_setattr */
3764 0, /* tp_compare */
3765 0, /* tp_repr */
3766 0, /* tp_as_number */
3767 0, /* tp_as_sequence */
3768 0, /* tp_as_mapping */
3769 0, /* tp_hash */
3770 0, /* tp_call */
3771 0, /* tp_str */
3772 0, /* tp_getattro */
3773 0, /* tp_setattro */
3774 0, /* tp_as_buffer */
3775 Py_TPFLAGS_DEFAULT,
Andrew Svetlov1c6c90f2012-12-23 20:09:01 +02003776 match_doc, /* tp_doc */
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003777 0, /* tp_traverse */
3778 0, /* tp_clear */
3779 0, /* tp_richcompare */
3780 0, /* tp_weaklistoffset */
3781 0, /* tp_iter */
3782 0, /* tp_iternext */
3783 match_methods, /* tp_methods */
3784 match_members, /* tp_members */
3785 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003786};
3787
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003788static PyObject*
3789pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3790{
3791 /* create match object (from state object) */
3792
3793 MatchObject* match;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003794 Py_ssize_t i, j;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003795 char* base;
3796 int n;
3797
3798 if (status > 0) {
3799
3800 /* create match object (with room for extra group marks) */
Christian Heimes4956d2b2008-01-18 19:12:56 +00003801 /* coverity[ampersand_in_size] */
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003802 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3803 2*(pattern->groups+1));
3804 if (!match)
3805 return NULL;
3806
3807 Py_INCREF(pattern);
3808 match->pattern = pattern;
3809
3810 Py_INCREF(state->string);
3811 match->string = state->string;
3812
3813 match->regs = NULL;
3814 match->groups = pattern->groups+1;
3815
3816 /* fill in group slices */
3817
3818 base = (char*) state->beginning;
3819 n = state->charsize;
3820
3821 match->mark[0] = ((char*) state->start - base) / n;
3822 match->mark[1] = ((char*) state->ptr - base) / n;
3823
3824 for (i = j = 0; i < pattern->groups; i++, j+=2)
3825 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3826 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3827 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3828 } else
3829 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3830
3831 match->pos = state->pos;
3832 match->endpos = state->endpos;
3833
3834 match->lastindex = state->lastindex;
3835
3836 return (PyObject*) match;
3837
3838 } else if (status == 0) {
3839
3840 /* no match */
3841 Py_INCREF(Py_None);
3842 return Py_None;
3843
3844 }
3845
3846 /* internal error */
3847 pattern_error(status);
3848 return NULL;
3849}
3850
3851
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003852/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003853/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003854
3855static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003856scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003857{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003858 state_fini(&self->state);
Antoine Pitrouefdddd32010-01-14 17:25:24 +00003859 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003860 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003861}
3862
3863static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003864scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003865{
3866 SRE_STATE* state = &self->state;
3867 PyObject* match;
3868 int status;
3869
Serhiy Storchaka7865f212015-07-06 13:58:24 +03003870 if (state->start == NULL)
3871 Py_RETURN_NONE;
3872
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003873 state_reset(state);
3874
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003875 state->ptr = state->start;
3876
3877 if (state->charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003878 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003879 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003880#if defined(HAVE_UNICODE)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003881 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003882#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003883 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003884 if (PyErr_Occurred())
3885 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003886
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003887 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003888 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003889
Serhiy Storchaka7865f212015-07-06 13:58:24 +03003890 if (status == 0)
3891 state->start = NULL;
3892 else if (state->ptr != state->start)
3893 state->start = state->ptr;
3894 else if (state->ptr != state->end)
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003895 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003896 else
Serhiy Storchaka7865f212015-07-06 13:58:24 +03003897 state->start = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003898
3899 return match;
3900}
3901
3902
3903static PyObject*
Georg Brandl964f5972006-05-28 22:38:57 +00003904scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003905{
3906 SRE_STATE* state = &self->state;
3907 PyObject* match;
3908 int status;
3909
Serhiy Storchaka7865f212015-07-06 13:58:24 +03003910 if (state->start == NULL)
3911 Py_RETURN_NONE;
3912
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003913 state_reset(state);
3914
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003915 state->ptr = state->start;
3916
3917 if (state->charsize == 1) {
3918 status = sre_search(state, PatternObject_GetCode(self->pattern));
3919 } else {
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003920#if defined(HAVE_UNICODE)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003921 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
Fredrik Lundh436c3d52000-06-29 08:58:44 +00003922#endif
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003923 }
Andrew M. Kuchling36126c42006-10-04 13:42:43 +00003924 if (PyErr_Occurred())
3925 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003926
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003927 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003928 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003929
Serhiy Storchaka7865f212015-07-06 13:58:24 +03003930 if (status == 0)
3931 state->start = NULL;
3932 else if (state->ptr != state->start)
3933 state->start = state->ptr;
3934 else if (state->ptr != state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003935 state->start = (void*) ((char*) state->ptr + state->charsize);
3936 else
Serhiy Storchaka7865f212015-07-06 13:58:24 +03003937 state->start = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003938
3939 return match;
3940}
3941
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003942static PyMethodDef scanner_methods[] = {
Georg Brandlfbef5882006-05-28 22:14:04 +00003943 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3944 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003945 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003946};
3947
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003948#define SCAN_OFF(x) offsetof(ScannerObject, x)
3949static PyMemberDef scanner_members[] = {
3950 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3951 {NULL} /* Sentinel */
3952};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003953
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003954statichere PyTypeObject Scanner_Type = {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003955 PyObject_HEAD_INIT(NULL)
Fredrik Lundh82b23072001-12-09 16:13:15 +00003956 0, "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003957 sizeof(ScannerObject), 0,
3958 (destructor)scanner_dealloc, /*tp_dealloc*/
Benjamin Peterson6116d4a2011-05-17 18:31:20 -05003959 0, /* tp_print */
3960 0, /* tp_getattr */
3961 0, /* tp_setattr */
3962 0, /* tp_reserved */
3963 0, /* tp_repr */
3964 0, /* tp_as_number */
3965 0, /* tp_as_sequence */
3966 0, /* tp_as_mapping */
3967 0, /* tp_hash */
3968 0, /* tp_call */
3969 0, /* tp_str */
3970 0, /* tp_getattro */
3971 0, /* tp_setattro */
3972 0, /* tp_as_buffer */
3973 Py_TPFLAGS_DEFAULT, /* tp_flags */
3974 0, /* tp_doc */
3975 0, /* tp_traverse */
3976 0, /* tp_clear */
3977 0, /* tp_richcompare */
3978 0, /* tp_weaklistoffset */
3979 0, /* tp_iter */
3980 0, /* tp_iternext */
3981 scanner_methods, /* tp_methods */
3982 scanner_members, /* tp_members */
3983 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003984};
3985
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003986static PyObject*
3987pattern_scanner(PatternObject* pattern, PyObject* args)
3988{
3989 /* create search state object */
3990
3991 ScannerObject* self;
3992
3993 PyObject* string;
Neal Norwitza6d80fa2006-06-12 03:05:40 +00003994 Py_ssize_t start = 0;
3995 Py_ssize_t end = PY_SSIZE_T_MAX;
3996 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00003997 return NULL;
3998
3999 /* create scanner object */
4000 self = PyObject_NEW(ScannerObject, &Scanner_Type);
4001 if (!self)
4002 return NULL;
Antoine Pitrouefdddd32010-01-14 17:25:24 +00004003 self->pattern = NULL;
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00004004
4005 string = state_init(&self->state, pattern, string, start, end);
4006 if (!string) {
Antoine Pitrouefdddd32010-01-14 17:25:24 +00004007 Py_DECREF(self);
Anthony Baxteraefd8ca2006-04-12 04:26:11 +00004008 return NULL;
4009 }
4010
4011 Py_INCREF(pattern);
4012 self->pattern = (PyObject*) pattern;
4013
4014 return (PyObject*) self;
4015}
4016
Guido van Rossumb700df92000-03-31 14:59:30 +00004017static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00004018 {"compile", _compile, METH_VARARGS},
Georg Brandlfbef5882006-05-28 22:14:04 +00004019 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00004020 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00004021 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00004022};
4023
Tim Peters3d563502006-01-21 02:47:53 +00004024#if PY_VERSION_HEX < 0x02030000
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00004025DL_EXPORT(void) init_sre(void)
4026#else
Mark Hammond8235ea12002-07-19 06:55:41 +00004027PyMODINIT_FUNC init_sre(void)
Andrew M. Kuchlingc24fe362003-04-30 13:09:08 +00004028#endif
Guido van Rossumb700df92000-03-31 14:59:30 +00004029{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00004030 PyObject* m;
4031 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00004032 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00004033
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00004034 /* Patch object types */
Benjamin Petersone266d3e2010-04-06 03:34:09 +00004035 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
4036 PyType_Ready(&Scanner_Type))
4037 return;
Guido van Rossumb700df92000-03-31 14:59:30 +00004038
Fredrik Lundh1c5aa692001-01-16 07:37:30 +00004039 m = Py_InitModule("_" SRE_MODULE, _functions);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00004040 if (m == NULL)
4041 return;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00004042 d = PyModule_GetDict(m);
4043
Fredrik Lundh21009b92001-09-18 18:47:09 +00004044 x = PyInt_FromLong(SRE_MAGIC);
4045 if (x) {
4046 PyDict_SetItemString(d, "MAGIC", x);
4047 Py_DECREF(x);
4048 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00004049
Martin v. Löwis78e2f062003-04-19 12:56:08 +00004050 x = PyInt_FromLong(sizeof(SRE_CODE));
4051 if (x) {
4052 PyDict_SetItemString(d, "CODESIZE", x);
4053 Py_DECREF(x);
4054 }
4055
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02004056 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
4057 if (x) {
4058 PyDict_SetItemString(d, "MAXREPEAT", x);
4059 Py_DECREF(x);
4060 }
4061
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004062 x = PyString_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00004063 if (x) {
4064 PyDict_SetItemString(d, "copyright", x);
4065 Py_DECREF(x);
4066 }
Guido van Rossumb700df92000-03-31 14:59:30 +00004067}
4068
Fredrik Lundh436c3d52000-06-29 08:58:44 +00004069#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00004070
4071/* vim:ts=4:sw=4:et
4072*/