blob: d6fcda18b694dbbecb13f15d2d942483e4635162 [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-09-20 fl added expand method
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2 compatibility
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl added sub/subn primitive
 * 2001-10-24 fl added finditer primitive (for 2.2 only)
 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn implemented non recursive scheme
 * 2013-02-04 mrab added fullmatch primitive
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
37
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000038static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000039 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000044#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000045
46#include "sre.h"
47
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030048#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
49
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000050#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000051
/* name of this module, minus the leading underscore; the build may
   predefine SRE_MODULE to override the default */
#ifndef SRE_MODULE
#define SRE_MODULE "sre"
#endif

/* name of the Python-level module */
#define SRE_PY_MODULE "re"

/* defining this one enables tracing (see TRACE) */
#undef VERBOSE

/* -------------------------------------------------------------------- */
/* optional features */

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY
70
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000071/* -------------------------------------------------------------------- */
72
/* LOCAL(type): declarator prefix for file-private helper functions */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes (all negative) */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE((fmt, ...)): printf-style tracing, compiled out unless VERBOSE
   is defined; note the doubled parentheses at the call site */
#ifdef VERBOSE
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
96
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000097/* -------------------------------------------------------------------- */
98/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
/* default character predicates (run sre_chars.py to regenerate tables) */

/* bit flags stored per character in sre_char_info */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* Classification flags for code points 0..127, one row per 16 code
   points.  The table is only ever read, so it is const. */
static const unsigned char sre_char_info[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* Lower-case mapping for code points 0..127: identity everywhere except
   'A'..'Z' (65..90), which map to 'a'..'z' (97..122).  Read-only. */
static const unsigned char sre_char_lower[128] = {
    /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
               93, 94, 95,
    /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
               124, 125, 126, 127
};

/* Each predicate yields the (non-zero) mask bit when the character has
   the property and 0 otherwise; characters >= 128 never match.  The
   argument is evaluated more than once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* ASCII-only lower-casing: code points below 128 go through the table,
   everything else is returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
}
143
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case ch with the C library's tolower() when it fits in 8 bits;
   larger values are returned untouched. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower(ch);
}
154
/* unicode-specific character predicates: thin aliases for the
   Py_UNICODE_* classification macros */

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')

/* Lower-case ch via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
167
Guido van Rossumb700df92000-03-31 14:59:30 +0000168LOCAL(int)
169sre_category(SRE_CODE category, unsigned int ch)
170{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000171 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000172
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000173 case SRE_CATEGORY_DIGIT:
174 return SRE_IS_DIGIT(ch);
175 case SRE_CATEGORY_NOT_DIGIT:
176 return !SRE_IS_DIGIT(ch);
177 case SRE_CATEGORY_SPACE:
178 return SRE_IS_SPACE(ch);
179 case SRE_CATEGORY_NOT_SPACE:
180 return !SRE_IS_SPACE(ch);
181 case SRE_CATEGORY_WORD:
182 return SRE_IS_WORD(ch);
183 case SRE_CATEGORY_NOT_WORD:
184 return !SRE_IS_WORD(ch);
185 case SRE_CATEGORY_LINEBREAK:
186 return SRE_IS_LINEBREAK(ch);
187 case SRE_CATEGORY_NOT_LINEBREAK:
188 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000190 case SRE_CATEGORY_LOC_WORD:
191 return SRE_LOC_IS_WORD(ch);
192 case SRE_CATEGORY_LOC_NOT_WORD:
193 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000195 case SRE_CATEGORY_UNI_DIGIT:
196 return SRE_UNI_IS_DIGIT(ch);
197 case SRE_CATEGORY_UNI_NOT_DIGIT:
198 return !SRE_UNI_IS_DIGIT(ch);
199 case SRE_CATEGORY_UNI_SPACE:
200 return SRE_UNI_IS_SPACE(ch);
201 case SRE_CATEGORY_UNI_NOT_SPACE:
202 return !SRE_UNI_IS_SPACE(ch);
203 case SRE_CATEGORY_UNI_WORD:
204 return SRE_UNI_IS_WORD(ch);
205 case SRE_CATEGORY_UNI_NOT_WORD:
206 return !SRE_UNI_IS_WORD(ch);
207 case SRE_CATEGORY_UNI_LINEBREAK:
208 return SRE_UNI_IS_LINEBREAK(ch);
209 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
210 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 }
212 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000213}
214
215/* helpers */
216
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000217static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000218data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000219{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000220 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000222 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000223 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000224 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000225}
226
227static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000228data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000229{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000230 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000231 minsize = state->data_stack_base+size;
232 cursize = state->data_stack_size;
233 if (cursize < minsize) {
234 void* stack;
235 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300236 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000238 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000239 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000240 return SRE_ERROR_MEMORY;
241 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000242 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000243 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000244 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000246}
247
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000248/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000249
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300250#define SRE_CHAR Py_UCS1
251#define SIZEOF_SRE_CHAR 1
252#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300253#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000254
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300255/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000256
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300257#define SRE_CHAR Py_UCS2
258#define SIZEOF_SRE_CHAR 2
259#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300260#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000261
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300262/* generate 32-bit unicode version */
263
264#define SRE_CHAR Py_UCS4
265#define SIZEOF_SRE_CHAR 4
266#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300267#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000268
269/* -------------------------------------------------------------------- */
270/* factories and destructors */
271
272/* see sre.h for object declarations */
Victor Stinnerf5587782013-11-15 23:21:11 +0100273static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, Py_ssize_t);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600274static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +0000275
276static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000277sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +0000278{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +0100279 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000282static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +0000283sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000284{
285 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000286 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000287 return NULL;
288 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000289 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000290 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000291 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +0000292 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293}
294
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000295LOCAL(void)
296state_reset(SRE_STATE* state)
297{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000298 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000299 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000300
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000301 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000302 state->lastindex = -1;
303
304 state->repeat = NULL;
305
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000306 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000307}
308
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000309static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200310getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300311 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600312 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000313{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000314 /* given a python object, return a data pointer, a length (in
315 characters), and a character size. return NULL if the object
316 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000317
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000318 /* Unicode objects do not support the buffer API. So, get the data
319 directly instead. */
320 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 if (PyUnicode_READY(string) == -1)
322 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200323 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200324 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300325 *p_isbytes = 0;
326 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000327 }
328
Victor Stinner0058b862011-09-29 03:27:47 +0200329 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300330 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
331 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
332 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000333 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000334
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300335 *p_length = view->len;
336 *p_charsize = 1;
337 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000338
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300339 if (view->buf == NULL) {
340 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
341 PyBuffer_Release(view);
342 view->buf = NULL;
343 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300345 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000346}
347
348LOCAL(PyObject*)
349state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000350 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000351{
352 /* prepare state object */
353
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000354 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300355 int isbytes, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000356 void* ptr;
357
358 memset(state, 0, sizeof(SRE_STATE));
359
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000360 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000361 state->lastindex = -1;
362
Benjamin Petersone48944b2012-03-07 14:50:25 -0600363 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300364 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000365 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600366 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000367
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300368 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600369 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300370 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600371 goto err;
372 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300373 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600374 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300375 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600376 goto err;
377 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000378
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000379 /* adjust boundaries */
380 if (start < 0)
381 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000382 else if (start > length)
383 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000384
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000385 if (end < 0)
386 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000387 else if (end > length)
388 end = length;
389
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300390 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000391 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +0000392
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000393 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000395 state->start = (void*) ((char*) ptr + start * state->charsize);
396 state->end = (void*) ((char*) ptr + end * state->charsize);
397
398 Py_INCREF(string);
399 state->string = string;
400 state->pos = start;
401 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000402
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000403 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000404 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000405 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000406 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000407 else
Fredrik Lundhb389df32000-06-29 12:48:37 +0000408 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000410 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600411 err:
412 if (state->buffer.buf)
413 PyBuffer_Release(&state->buffer);
414 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000415}
416
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000417LOCAL(void)
418state_fini(SRE_STATE* state)
419{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600420 if (state->buffer.buf)
421 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000423 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000424}
425
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000426/* calculate offset from start of string */
427#define STATE_OFFSET(state, member)\
428 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
429
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000430LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300431getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300432 PyObject* string, Py_ssize_t start, Py_ssize_t end)
433{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300434 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300435 if (PyBytes_CheckExact(string) &&
436 start == 0 && end == PyBytes_GET_SIZE(string)) {
437 Py_INCREF(string);
438 return string;
439 }
440 return PyBytes_FromStringAndSize(
441 (const char *)ptr + start, end - start);
442 }
443 else {
444 return PyUnicode_Substring(string, start, end);
445 }
446}
447
448LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000449state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000450{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000451 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000452
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000453 index = (index - 1) * 2;
454
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000455 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000456 if (empty)
457 /* want empty string */
458 i = j = 0;
459 else {
460 Py_INCREF(Py_None);
461 return Py_None;
462 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000463 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000464 i = STATE_OFFSET(state, state->mark[index]);
465 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000466 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000467
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300468 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000469}
470
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000471static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100472pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000473{
474 switch (status) {
475 case SRE_ERROR_RECURSION_LIMIT:
476 PyErr_SetString(
477 PyExc_RuntimeError,
478 "maximum recursion limit exceeded"
479 );
480 break;
481 case SRE_ERROR_MEMORY:
482 PyErr_NoMemory();
483 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000484 case SRE_ERROR_INTERRUPTED:
485 /* An exception has already been raised, so let it fly */
486 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000487 default:
488 /* other error codes indicate compiler/engine bugs */
489 PyErr_SetString(
490 PyExc_RuntimeError,
491 "internal error in regular expression engine"
492 );
493 }
494}
495
Guido van Rossumb700df92000-03-31 14:59:30 +0000496static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000497pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000498{
Raymond Hettinger027bb632004-05-31 03:09:25 +0000499 if (self->weakreflist != NULL)
500 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000501 Py_XDECREF(self->pattern);
502 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000503 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +0000505}
506
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300507LOCAL(Py_ssize_t)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300508sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300509{
510 if (state->charsize == 1)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300511 return sre_ucs1_match(state, pattern, match_all);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300512 if (state->charsize == 2)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300513 return sre_ucs2_match(state, pattern, match_all);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300514 assert(state->charsize == 4);
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300515 return sre_ucs4_match(state, pattern, match_all);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300516}
517
518LOCAL(Py_ssize_t)
519sre_search(SRE_STATE* state, SRE_CODE* pattern)
520{
521 if (state->charsize == 1)
522 return sre_ucs1_search(state, pattern);
523 if (state->charsize == 2)
524 return sre_ucs2_search(state, pattern);
525 assert(state->charsize == 4);
526 return sre_ucs4_search(state, pattern);
527}
528
Larry Hastings16c51912014-01-07 11:53:01 -0800529static PyObject *
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200530fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
531{
532 if (string2 != NULL) {
533 if (string != NULL) {
534 PyErr_Format(PyExc_TypeError,
535 "Argument given by name ('%s') and position (1)",
536 oldname);
537 return NULL;
538 }
539 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
540 "The '%s' keyword parameter name is deprecated. "
541 "Use 'string' instead.", oldname) < 0)
542 return NULL;
543 return string2;
544 }
545 if (string == NULL) {
546 PyErr_SetString(PyExc_TypeError,
547 "Required argument 'string' (pos 1) not found");
548 return NULL;
549 }
550 return string;
551}
Larry Hastings16c51912014-01-07 11:53:01 -0800552
553static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -0800554pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
Larry Hastings16c51912014-01-07 11:53:01 -0800555{
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200556 static char *_keywords[] = {"string", "pos", "endpos", "pattern", NULL};
557 PyObject *string = NULL;
Larry Hastings16c51912014-01-07 11:53:01 -0800558 Py_ssize_t pos = 0;
559 Py_ssize_t endpos = PY_SSIZE_T_MAX;
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200560 PyObject *pattern = NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000561 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100562 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +0000563
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200564 if (!PyArg_ParseTupleAndKeywords(args, kwargs,
565 "|Onn$O:match", _keywords,
566 &string, &pos, &endpos, &pattern))
567 return NULL;
568 string = fix_string_param(string, pattern, "pattern");
569 if (!string)
570 return NULL;
571 string = state_init(&state, (PatternObject *)self, string, pos, endpos);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000572 if (!string)
573 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000574
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000575 state.ptr = state.start;
576
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000577 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
578
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300579 status = sre_match(&state, PatternObject_GetCode(self), 0);
Guido van Rossumb700df92000-03-31 14:59:30 +0000580
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000581 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +0000582 if (PyErr_Occurred())
583 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000584
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000585 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000586
Larry Hastings16c51912014-01-07 11:53:01 -0800587 return (PyObject *)pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +0000588}
589
590static PyObject*
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200591pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
592{
593 SRE_STATE state;
594 Py_ssize_t status;
595
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200596 PyObject *string = NULL, *string2 = NULL;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200597 Py_ssize_t start = 0;
598 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200599 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200600 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:fullmatch", kwlist,
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200601 &string, &start, &end, &string2))
602 return NULL;
603
604 string = fix_string_param(string, string2, "pattern");
605 if (!string)
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200606 return NULL;
607
608 string = state_init(&state, self, string, start, end);
609 if (!string)
610 return NULL;
611
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200612 state.ptr = state.start;
613
614 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
615
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300616 status = sre_match(&state, PatternObject_GetCode(self), 1);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200617
618 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
619 if (PyErr_Occurred())
620 return NULL;
621
622 state_fini(&state);
623
624 return pattern_new_match(self, &state, status);
625}
626
627static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000628pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000629{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000630 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100631 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +0000632
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200633 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000634 Py_ssize_t start = 0;
635 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200636 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
637 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:search", kwlist,
638 &string, &start, &end, &string2))
639 return NULL;
640
641 string = fix_string_param(string, string2, "pattern");
642 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000643 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000644
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000645 string = state_init(&state, self, string, start, end);
646 if (!string)
647 return NULL;
648
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000649 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
650
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300651 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000652
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000653 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
654
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000655 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000656
Thomas Wouters89f507f2006-12-13 04:49:30 +0000657 if (PyErr_Occurred())
658 return NULL;
659
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000660 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +0000661}
662
663static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000664call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000665{
666 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000667 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000668 PyObject* func;
669 PyObject* result;
670
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000671 if (!args)
672 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000673 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000674 if (!name)
675 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000676 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000677 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000678 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000679 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000680 func = PyObject_GetAttrString(mod, function);
681 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000682 if (!func)
683 return NULL;
684 result = PyObject_CallObject(func, args);
685 Py_DECREF(func);
686 Py_DECREF(args);
687 return result;
688}
689
#ifdef USE_BUILTIN_COPY
/*
 * Replace *object with copy.deepcopy(*object, memo).
 *
 * Returns 1 on success.  On failure returns 0 and leaves *object as
 * it was.
 */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate == NULL)
        return 0;

    /* drop the old reference and store the deep copy in its place */
    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
709
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000710static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000711pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000712{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000713 SRE_STATE state;
714 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100715 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000716 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000717
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200718 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000719 Py_ssize_t start = 0;
720 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200721 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
722 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:findall", kwlist,
723 &string, &start, &end, &string2))
724 return NULL;
725
726 string = fix_string_param(string, string2, "source");
727 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000728 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000729
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000730 string = state_init(&state, self, string, start, end);
731 if (!string)
732 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000733
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000734 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000735 if (!list) {
736 state_fini(&state);
737 return NULL;
738 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000739
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000740 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000741
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000742 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000743
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000744 state_reset(&state);
745
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000746 state.ptr = state.start;
747
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300748 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300749 if (PyErr_Occurred())
750 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000751
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000752 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000753 if (status == 0)
754 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000755 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000756 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000757 }
Tim Peters3d563502006-01-21 02:47:53 +0000758
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000759 /* don't bother to build a match object */
760 switch (self->groups) {
761 case 0:
762 b = STATE_OFFSET(&state, state.start);
763 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300764 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300765 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000766 if (!item)
767 goto error;
768 break;
769 case 1:
770 item = state_getslice(&state, 1, string, 1);
771 if (!item)
772 goto error;
773 break;
774 default:
775 item = PyTuple_New(self->groups);
776 if (!item)
777 goto error;
778 for (i = 0; i < self->groups; i++) {
779 PyObject* o = state_getslice(&state, i+1, string, 1);
780 if (!o) {
781 Py_DECREF(item);
782 goto error;
783 }
784 PyTuple_SET_ITEM(item, i, o);
785 }
786 break;
787 }
788
789 status = PyList_Append(list, item);
790 Py_DECREF(item);
791 if (status < 0)
792 goto error;
793
794 if (state.ptr == state.start)
795 state.start = (void*) ((char*) state.ptr + state.charsize);
796 else
797 state.start = state.ptr;
798
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000799 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000800
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000801 state_fini(&state);
802 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000803
804error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000805 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000806 state_fini(&state);
807 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000808
Guido van Rossumb700df92000-03-31 14:59:30 +0000809}
810
Fredrik Lundh703ce812001-10-24 22:16:30 +0000811static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600812pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
Fredrik Lundh703ce812001-10-24 22:16:30 +0000813{
814 PyObject* scanner;
815 PyObject* search;
816 PyObject* iterator;
817
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600818 scanner = pattern_scanner(pattern, args, kw);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000819 if (!scanner)
820 return NULL;
821
822 search = PyObject_GetAttrString(scanner, "search");
823 Py_DECREF(scanner);
824 if (!search)
825 return NULL;
826
827 iterator = PyCallIter_New(search, Py_None);
828 Py_DECREF(search);
829
830 return iterator;
831}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000832
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000833static PyObject*
834pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
835{
836 SRE_STATE state;
837 PyObject* list;
838 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100839 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000840 Py_ssize_t n;
841 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000842 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000843
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200844 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000845 Py_ssize_t maxsplit = 0;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200846 static char* kwlist[] = { "string", "maxsplit", "source", NULL };
847 if (!PyArg_ParseTupleAndKeywords(args, kw, "|On$O:split", kwlist,
848 &string, &maxsplit, &string2))
849 return NULL;
850
851 string = fix_string_param(string, string2, "source");
852 if (!string)
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000853 return NULL;
854
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000855 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000856 if (!string)
857 return NULL;
858
859 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000860 if (!list) {
861 state_fini(&state);
862 return NULL;
863 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000864
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000865 n = 0;
866 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000867
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000868 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000869
870 state_reset(&state);
871
872 state.ptr = state.start;
873
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300874 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300875 if (PyErr_Occurred())
876 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000877
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000878 if (status <= 0) {
879 if (status == 0)
880 break;
881 pattern_error(status);
882 goto error;
883 }
Tim Peters3d563502006-01-21 02:47:53 +0000884
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000885 if (state.start == state.ptr) {
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +0300886 if (last == state.end || state.ptr == state.end)
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000887 break;
888 /* skip one character */
889 state.start = (void*) ((char*) state.ptr + state.charsize);
890 continue;
891 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000892
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000893 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300894 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000895 string, STATE_OFFSET(&state, last),
896 STATE_OFFSET(&state, state.start)
897 );
898 if (!item)
899 goto error;
900 status = PyList_Append(list, item);
901 Py_DECREF(item);
902 if (status < 0)
903 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000904
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000905 /* add groups (if any) */
906 for (i = 0; i < self->groups; i++) {
907 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000908 if (!item)
909 goto error;
910 status = PyList_Append(list, item);
911 Py_DECREF(item);
912 if (status < 0)
913 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000914 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000915
916 n = n + 1;
917
918 last = state.start = state.ptr;
919
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000920 }
921
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000922 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300923 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000924 string, STATE_OFFSET(&state, last), state.endpos
925 );
926 if (!item)
927 goto error;
928 status = PyList_Append(list, item);
929 Py_DECREF(item);
930 if (status < 0)
931 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000932
933 state_fini(&state);
934 return list;
935
936error:
937 Py_DECREF(list);
938 state_fini(&state);
939 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000940
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000941}
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000942
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000943static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000944pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000945 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000946{
947 SRE_STATE state;
948 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +0300949 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000950 PyObject* item;
951 PyObject* filter;
952 PyObject* args;
953 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000954 void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100955 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000956 Py_ssize_t n;
957 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300958 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000959 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600960 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000961
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000962 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +0000963 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000964 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +0000965 Py_INCREF(filter);
966 filter_is_callable = 1;
967 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000968 /* if not callable, check if it's a literal string */
969 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600970 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300971 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000973 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300974 if (charsize == 1)
975 literal = memchr(ptr, '\\', n) == NULL;
976 else
977 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000978 } else {
979 PyErr_Clear();
980 literal = 0;
981 }
Benjamin Petersone48944b2012-03-07 14:50:25 -0600982 if (view.buf)
983 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000984 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000985 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000986 Py_INCREF(filter);
987 filter_is_callable = 0;
988 } else {
989 /* not a literal; hand it over to the template compiler */
990 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +0000991 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000992 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000993 );
994 if (!filter)
995 return NULL;
996 filter_is_callable = PyCallable_Check(filter);
997 }
Fredrik Lundhdac58492001-10-21 21:48:30 +0000998 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000999
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001000 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001001 if (!string) {
1002 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001003 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001004 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001005
1006 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001007 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001008 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001009 state_fini(&state);
1010 return NULL;
1011 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001012
1013 n = i = 0;
1014
1015 while (!count || n < count) {
1016
1017 state_reset(&state);
1018
1019 state.ptr = state.start;
1020
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001021 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001022 if (PyErr_Occurred())
1023 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001024
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001025 if (status <= 0) {
1026 if (status == 0)
1027 break;
1028 pattern_error(status);
1029 goto error;
1030 }
Tim Peters3d563502006-01-21 02:47:53 +00001031
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001032 b = STATE_OFFSET(&state, state.start);
1033 e = STATE_OFFSET(&state, state.ptr);
1034
1035 if (i < b) {
1036 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001037 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001038 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001039 if (!item)
1040 goto error;
1041 status = PyList_Append(list, item);
1042 Py_DECREF(item);
1043 if (status < 0)
1044 goto error;
1045
1046 } else if (i == b && i == e && n > 0)
1047 /* ignore empty match on latest position */
1048 goto next;
1049
1050 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001051 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001052 match = pattern_new_match(self, &state, 1);
1053 if (!match)
1054 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001055 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001056 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00001057 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001058 goto error;
1059 }
1060 item = PyObject_CallObject(filter, args);
1061 Py_DECREF(args);
1062 Py_DECREF(match);
1063 if (!item)
1064 goto error;
1065 } else {
1066 /* filter is literal string */
1067 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001068 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001069 }
1070
1071 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001072 if (item != Py_None) {
1073 status = PyList_Append(list, item);
1074 Py_DECREF(item);
1075 if (status < 0)
1076 goto error;
1077 }
Tim Peters3d563502006-01-21 02:47:53 +00001078
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001079 i = e;
1080 n = n + 1;
1081
1082next:
1083 /* move on */
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03001084 if (state.ptr == state.end)
1085 break;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001086 if (state.ptr == state.start)
1087 state.start = (void*) ((char*) state.ptr + state.charsize);
1088 else
1089 state.start = state.ptr;
1090
1091 }
1092
1093 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001094 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001095 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001096 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001097 if (!item)
1098 goto error;
1099 status = PyList_Append(list, item);
1100 Py_DECREF(item);
1101 if (status < 0)
1102 goto error;
1103 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001104
1105 state_fini(&state);
1106
Guido van Rossum4e173842001-12-07 04:25:10 +00001107 Py_DECREF(filter);
1108
Fredrik Lundhdac58492001-10-21 21:48:30 +00001109 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001110 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001111 if (!joiner) {
1112 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001113 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001114 }
1115 if (PyList_GET_SIZE(list) == 0) {
1116 Py_DECREF(list);
1117 item = joiner;
1118 }
1119 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001120 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001121 item = _PyBytes_Join(joiner, list);
1122 else
1123 item = PyUnicode_Join(joiner, list);
1124 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001125 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001126 if (!item)
1127 return NULL;
1128 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001129
1130 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001131 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001132
1133 return item;
1134
1135error:
1136 Py_DECREF(list);
1137 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001138 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001139 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001140
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001141}
1142
1143static PyObject*
1144pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
1145{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001146 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001147 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001148 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001149 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001150 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001151 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001152 return NULL;
1153
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001154 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001155}
1156
1157static PyObject*
1158pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
1159{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001160 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001161 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001162 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001163 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001164 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001165 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001166 return NULL;
1167
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001168 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001169}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001170
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001171static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001172pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001173{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001174#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001175 PatternObject* copy;
1176 int offset;
1177
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001178 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1179 if (!copy)
1180 return NULL;
1181
1182 offset = offsetof(PatternObject, groups);
1183
1184 Py_XINCREF(self->groupindex);
1185 Py_XINCREF(self->indexgroup);
1186 Py_XINCREF(self->pattern);
1187
1188 memcpy((char*) copy + offset, (char*) self + offset,
1189 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00001190 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001191
1192 return (PyObject*) copy;
1193#else
1194 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1195 return NULL;
1196#endif
1197}
1198
1199static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001200pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001201{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001202#ifdef USE_BUILTIN_COPY
1203 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00001204
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001205 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001206 if (!copy)
1207 return NULL;
1208
1209 if (!deepcopy(&copy->groupindex, memo) ||
1210 !deepcopy(&copy->indexgroup, memo) ||
1211 !deepcopy(&copy->pattern, memo)) {
1212 Py_DECREF(copy);
1213 return NULL;
1214 }
1215
1216#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001217 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1218 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001219#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001220}
1221
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001222static PyObject *
1223pattern_repr(PatternObject *obj)
1224{
1225 static const struct {
1226 const char *name;
1227 int value;
1228 } flag_names[] = {
1229 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1230 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1231 {"re.LOCALE", SRE_FLAG_LOCALE},
1232 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1233 {"re.DOTALL", SRE_FLAG_DOTALL},
1234 {"re.UNICODE", SRE_FLAG_UNICODE},
1235 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1236 {"re.DEBUG", SRE_FLAG_DEBUG},
1237 {"re.ASCII", SRE_FLAG_ASCII},
1238 };
1239 PyObject *result = NULL;
1240 PyObject *flag_items;
1241 int i;
1242 int flags = obj->flags;
1243
1244 /* Omit re.UNICODE for valid string patterns. */
1245 if (obj->isbytes == 0 &&
1246 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1247 SRE_FLAG_UNICODE)
1248 flags &= ~SRE_FLAG_UNICODE;
1249
1250 flag_items = PyList_New(0);
1251 if (!flag_items)
1252 return NULL;
1253
1254 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1255 if (flags & flag_names[i].value) {
1256 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1257 if (!item)
1258 goto done;
1259
1260 if (PyList_Append(flag_items, item) < 0) {
1261 Py_DECREF(item);
1262 goto done;
1263 }
1264 Py_DECREF(item);
1265 flags &= ~flag_names[i].value;
1266 }
1267 }
1268 if (flags) {
1269 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1270 if (!item)
1271 goto done;
1272
1273 if (PyList_Append(flag_items, item) < 0) {
1274 Py_DECREF(item);
1275 goto done;
1276 }
1277 Py_DECREF(item);
1278 }
1279
1280 if (PyList_Size(flag_items) > 0) {
1281 PyObject *flags_result;
1282 PyObject *sep = PyUnicode_FromString("|");
1283 if (!sep)
1284 goto done;
1285 flags_result = PyUnicode_Join(sep, flag_items);
1286 Py_DECREF(sep);
1287 if (!flags_result)
1288 goto done;
1289 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1290 obj->pattern, flags_result);
1291 Py_DECREF(flags_result);
1292 }
1293 else {
1294 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1295 }
1296
1297done:
1298 Py_DECREF(flag_items);
1299 return result;
1300}
1301
/* Docstrings for the pattern object and its methods.  Each PyDoc_STRVAR
   below defines one named docstring; pattern_methods[] and Pattern_Type
   refer to them by name.  The text is emitted at runtime, so the string
   contents (including continuation-line whitespace) are left untouched. */
Raymond Hettinger94478742004-09-24 04:31:19 +00001302PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001303"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001304 Matches zero or more characters at the beginning of the string");
1305
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001306PyDoc_STRVAR(pattern_fullmatch_doc,
1307"fullmatch(string[, pos[, endpos]]) -> match object or None.\n\
1308 Matches against all of the string");
1309
Raymond Hettinger94478742004-09-24 04:31:19 +00001310PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001311"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001312 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02001313 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001314
1315PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001316"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001317 Split string by the occurrences of pattern.");
1318
1319PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001320"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001321 Return a list of all non-overlapping matches of pattern in string.");
1322
1323PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001324"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001325 Return an iterator over all non-overlapping matches for the \n\
1326 RE pattern in string. For each match, the iterator returns a\n\
1327 match object.");
1328
1329PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001330"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001331 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00001332 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001333
1334PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001335"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001336 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
1337 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00001338 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001339
1340PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
1341
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001342static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00001343 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001344 pattern_match_doc},
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001345 {"fullmatch", (PyCFunction) pattern_fullmatch, METH_VARARGS|METH_KEYWORDS,
1346 pattern_fullmatch_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001347 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001348 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001349 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001350 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001351 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001352 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001353 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001354 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001355 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001356 pattern_findall_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001357 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001358 pattern_finditer_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001359 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001360 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
1361 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001362 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00001363};
1364
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00001365#define PAT_OFF(x) offsetof(PatternObject, x)
1366static PyMemberDef pattern_members[] = {
1367 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
1368 {"flags", T_INT, PAT_OFF(flags), READONLY},
1369 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
1370 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
1371 {NULL} /* Sentinel */
1372};
Guido van Rossumb700df92000-03-31 14:59:30 +00001373
Neal Norwitz57c179c2006-03-22 07:18:02 +00001374static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001375 PyVarObject_HEAD_INIT(NULL, 0)
1376 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00001377 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001378 (destructor)pattern_dealloc, /* tp_dealloc */
1379 0, /* tp_print */
1380 0, /* tp_getattr */
1381 0, /* tp_setattr */
1382 0, /* tp_reserved */
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001383 (reprfunc)pattern_repr, /* tp_repr */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001384 0, /* tp_as_number */
1385 0, /* tp_as_sequence */
1386 0, /* tp_as_mapping */
1387 0, /* tp_hash */
1388 0, /* tp_call */
1389 0, /* tp_str */
1390 0, /* tp_getattro */
1391 0, /* tp_setattro */
1392 0, /* tp_as_buffer */
1393 Py_TPFLAGS_DEFAULT, /* tp_flags */
1394 pattern_doc, /* tp_doc */
1395 0, /* tp_traverse */
1396 0, /* tp_clear */
1397 0, /* tp_richcompare */
1398 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
1399 0, /* tp_iter */
1400 0, /* tp_iternext */
1401 pattern_methods, /* tp_methods */
1402 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00001403};
1404
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001405static int _validate(PatternObject *self); /* Forward */
1406
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001407static PyObject *
1408_compile(PyObject* self_, PyObject* args)
1409{
1410 /* "compile" pattern descriptor to pattern object */
1411
1412 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001413 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001414
1415 PyObject* pattern;
1416 int flags = 0;
1417 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001418 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001419 PyObject* groupindex = NULL;
1420 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001421
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001422 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001423 &PyList_Type, &code, &groups,
1424 &groupindex, &indexgroup))
1425 return NULL;
1426
1427 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001428 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001429 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1430 if (!self)
1431 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001432 self->weakreflist = NULL;
1433 self->pattern = NULL;
1434 self->groupindex = NULL;
1435 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001436
1437 self->codesize = n;
1438
1439 for (i = 0; i < n; i++) {
1440 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001441 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001442 self->code[i] = (SRE_CODE) value;
1443 if ((unsigned long) self->code[i] != value) {
1444 PyErr_SetString(PyExc_OverflowError,
1445 "regular expression code size limit exceeded");
1446 break;
1447 }
1448 }
1449
1450 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001451 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001452 return NULL;
1453 }
1454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001456 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 else {
1459 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001460 int charsize;
1461 Py_buffer view;
1462 view.buf = NULL;
1463 if (!getstring(pattern, &p_length, &self->isbytes,
1464 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 Py_DECREF(self);
1466 return NULL;
1467 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001468 if (view.buf)
1469 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001471
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001472 Py_INCREF(pattern);
1473 self->pattern = pattern;
1474
1475 self->flags = flags;
1476
1477 self->groups = groups;
1478
1479 Py_XINCREF(groupindex);
1480 self->groupindex = groupindex;
1481
1482 Py_XINCREF(indexgroup);
1483 self->indexgroup = indexgroup;
1484
1485 self->weakreflist = NULL;
1486
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001487 if (!_validate(self)) {
1488 Py_DECREF(self);
1489 return NULL;
1490 }
1491
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001492 return (PyObject*) self;
1493}
1494
/* -------------------------------------------------------------------- */
/* Code validation */

/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1519
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator; takes a parenthesized printf argument
   list, e.g. VTRACE(("x=%d\n", x)) */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the function that uses the macro */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: the cursor
   'code', the limit 'end', and the destination 'op', 'arg' or 'skip'.
   Each one FAILs if the cursor has already reached 'end', otherwise reads
   one code word and advances the cursor past it. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and additionally FAIL when skip-adj is larger than the
   number of code words left between the skip word and 'end' */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (Py_uintptr_t)(end - code))    \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001560
1561static int
1562_validate_charset(SRE_CODE *code, SRE_CODE *end)
1563{
1564 /* Some variables are manipulated by the macros above */
1565 SRE_CODE op;
1566 SRE_CODE arg;
1567 SRE_CODE offset;
1568 int i;
1569
1570 while (code < end) {
1571 GET_OP;
1572 switch (op) {
1573
1574 case SRE_OP_NEGATE:
1575 break;
1576
1577 case SRE_OP_LITERAL:
1578 GET_ARG;
1579 break;
1580
1581 case SRE_OP_RANGE:
1582 GET_ARG;
1583 GET_ARG;
1584 break;
1585
1586 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001587 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001588 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001589 FAIL;
1590 code += offset;
1591 break;
1592
1593 case SRE_OP_BIGCHARSET:
1594 GET_ARG; /* Number of blocks */
1595 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001596 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001597 FAIL;
1598 /* Make sure that each byte points to a valid block */
1599 for (i = 0; i < 256; i++) {
1600 if (((unsigned char *)code)[i] >= arg)
1601 FAIL;
1602 }
1603 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001604 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001605 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001606 FAIL;
1607 code += offset;
1608 break;
1609
1610 case SRE_OP_CATEGORY:
1611 GET_ARG;
1612 switch (arg) {
1613 case SRE_CATEGORY_DIGIT:
1614 case SRE_CATEGORY_NOT_DIGIT:
1615 case SRE_CATEGORY_SPACE:
1616 case SRE_CATEGORY_NOT_SPACE:
1617 case SRE_CATEGORY_WORD:
1618 case SRE_CATEGORY_NOT_WORD:
1619 case SRE_CATEGORY_LINEBREAK:
1620 case SRE_CATEGORY_NOT_LINEBREAK:
1621 case SRE_CATEGORY_LOC_WORD:
1622 case SRE_CATEGORY_LOC_NOT_WORD:
1623 case SRE_CATEGORY_UNI_DIGIT:
1624 case SRE_CATEGORY_UNI_NOT_DIGIT:
1625 case SRE_CATEGORY_UNI_SPACE:
1626 case SRE_CATEGORY_UNI_NOT_SPACE:
1627 case SRE_CATEGORY_UNI_WORD:
1628 case SRE_CATEGORY_UNI_NOT_WORD:
1629 case SRE_CATEGORY_UNI_LINEBREAK:
1630 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1631 break;
1632 default:
1633 FAIL;
1634 }
1635 break;
1636
1637 default:
1638 FAIL;
1639
1640 }
1641 }
1642
1643 return 1;
1644}
1645
1646static int
1647_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1648{
1649 /* Some variables are manipulated by the macros above */
1650 SRE_CODE op;
1651 SRE_CODE arg;
1652 SRE_CODE skip;
1653
1654 VTRACE(("code=%p, end=%p\n", code, end));
1655
1656 if (code > end)
1657 FAIL;
1658
1659 while (code < end) {
1660 GET_OP;
1661 switch (op) {
1662
1663 case SRE_OP_MARK:
1664 /* We don't check whether marks are properly nested; the
1665 sre_match() code is robust even if they don't, and the worst
1666 you can get is nonsensical match results. */
1667 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001668 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001669 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1670 FAIL;
1671 }
1672 break;
1673
1674 case SRE_OP_LITERAL:
1675 case SRE_OP_NOT_LITERAL:
1676 case SRE_OP_LITERAL_IGNORE:
1677 case SRE_OP_NOT_LITERAL_IGNORE:
1678 GET_ARG;
1679 /* The arg is just a character, nothing to check */
1680 break;
1681
1682 case SRE_OP_SUCCESS:
1683 case SRE_OP_FAILURE:
1684 /* Nothing to check; these normally end the matching process */
1685 break;
1686
1687 case SRE_OP_AT:
1688 GET_ARG;
1689 switch (arg) {
1690 case SRE_AT_BEGINNING:
1691 case SRE_AT_BEGINNING_STRING:
1692 case SRE_AT_BEGINNING_LINE:
1693 case SRE_AT_END:
1694 case SRE_AT_END_LINE:
1695 case SRE_AT_END_STRING:
1696 case SRE_AT_BOUNDARY:
1697 case SRE_AT_NON_BOUNDARY:
1698 case SRE_AT_LOC_BOUNDARY:
1699 case SRE_AT_LOC_NON_BOUNDARY:
1700 case SRE_AT_UNI_BOUNDARY:
1701 case SRE_AT_UNI_NON_BOUNDARY:
1702 break;
1703 default:
1704 FAIL;
1705 }
1706 break;
1707
1708 case SRE_OP_ANY:
1709 case SRE_OP_ANY_ALL:
1710 /* These have no operands */
1711 break;
1712
1713 case SRE_OP_IN:
1714 case SRE_OP_IN_IGNORE:
1715 GET_SKIP;
1716 /* Stop 1 before the end; we check the FAILURE below */
1717 if (!_validate_charset(code, code+skip-2))
1718 FAIL;
1719 if (code[skip-2] != SRE_OP_FAILURE)
1720 FAIL;
1721 code += skip-1;
1722 break;
1723
1724 case SRE_OP_INFO:
1725 {
1726 /* A minimal info field is
1727 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1728 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1729 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001730 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001731 SRE_CODE *newcode;
1732 GET_SKIP;
1733 newcode = code+skip-1;
1734 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001735 GET_ARG;
1736 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001737 /* Check that only valid flags are present */
1738 if ((flags & ~(SRE_INFO_PREFIX |
1739 SRE_INFO_LITERAL |
1740 SRE_INFO_CHARSET)) != 0)
1741 FAIL;
1742 /* PREFIX and CHARSET are mutually exclusive */
1743 if ((flags & SRE_INFO_PREFIX) &&
1744 (flags & SRE_INFO_CHARSET))
1745 FAIL;
1746 /* LITERAL implies PREFIX */
1747 if ((flags & SRE_INFO_LITERAL) &&
1748 !(flags & SRE_INFO_PREFIX))
1749 FAIL;
1750 /* Validate the prefix */
1751 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001752 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001753 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001754 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001755 /* Here comes the prefix string */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001756 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001757 FAIL;
1758 code += prefix_len;
1759 /* And here comes the overlap table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001760 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001761 FAIL;
1762 /* Each overlap value should be < prefix_len */
1763 for (i = 0; i < prefix_len; i++) {
1764 if (code[i] >= prefix_len)
1765 FAIL;
1766 }
1767 code += prefix_len;
1768 }
1769 /* Validate the charset */
1770 if (flags & SRE_INFO_CHARSET) {
1771 if (!_validate_charset(code, newcode-1))
1772 FAIL;
1773 if (newcode[-1] != SRE_OP_FAILURE)
1774 FAIL;
1775 code = newcode;
1776 }
1777 else if (code != newcode) {
1778 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1779 FAIL;
1780 }
1781 }
1782 break;
1783
1784 case SRE_OP_BRANCH:
1785 {
1786 SRE_CODE *target = NULL;
1787 for (;;) {
1788 GET_SKIP;
1789 if (skip == 0)
1790 break;
1791 /* Stop 2 before the end; we check the JUMP below */
1792 if (!_validate_inner(code, code+skip-3, groups))
1793 FAIL;
1794 code += skip-3;
1795 /* Check that it ends with a JUMP, and that each JUMP
1796 has the same target */
1797 GET_OP;
1798 if (op != SRE_OP_JUMP)
1799 FAIL;
1800 GET_SKIP;
1801 if (target == NULL)
1802 target = code+skip-1;
1803 else if (code+skip-1 != target)
1804 FAIL;
1805 }
1806 }
1807 break;
1808
1809 case SRE_OP_REPEAT_ONE:
1810 case SRE_OP_MIN_REPEAT_ONE:
1811 {
1812 SRE_CODE min, max;
1813 GET_SKIP;
1814 GET_ARG; min = arg;
1815 GET_ARG; max = arg;
1816 if (min > max)
1817 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001818 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001819 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001820 if (!_validate_inner(code, code+skip-4, groups))
1821 FAIL;
1822 code += skip-4;
1823 GET_OP;
1824 if (op != SRE_OP_SUCCESS)
1825 FAIL;
1826 }
1827 break;
1828
1829 case SRE_OP_REPEAT:
1830 {
1831 SRE_CODE min, max;
1832 GET_SKIP;
1833 GET_ARG; min = arg;
1834 GET_ARG; max = arg;
1835 if (min > max)
1836 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001837 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001838 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001839 if (!_validate_inner(code, code+skip-3, groups))
1840 FAIL;
1841 code += skip-3;
1842 GET_OP;
1843 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1844 FAIL;
1845 }
1846 break;
1847
1848 case SRE_OP_GROUPREF:
1849 case SRE_OP_GROUPREF_IGNORE:
1850 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001851 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001852 FAIL;
1853 break;
1854
1855 case SRE_OP_GROUPREF_EXISTS:
1856 /* The regex syntax for this is: '(?(group)then|else)', where
1857 'group' is either an integer group number or a group name,
1858 'then' and 'else' are sub-regexes, and 'else' is optional. */
1859 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001860 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001861 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001862 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001863 code--; /* The skip is relative to the first arg! */
1864 /* There are two possibilities here: if there is both a 'then'
1865 part and an 'else' part, the generated code looks like:
1866
1867 GROUPREF_EXISTS
1868 <group>
1869 <skipyes>
1870 ...then part...
1871 JUMP
1872 <skipno>
1873 (<skipyes> jumps here)
1874 ...else part...
1875 (<skipno> jumps here)
1876
1877 If there is only a 'then' part, it looks like:
1878
1879 GROUPREF_EXISTS
1880 <group>
1881 <skip>
1882 ...then part...
1883 (<skip> jumps here)
1884
1885 There is no direct way to decide which it is, and we don't want
1886 to allow arbitrary jumps anywhere in the code; so we just look
1887 for a JUMP opcode preceding our skip target.
1888 */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001889 if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001890 code[skip-3] == SRE_OP_JUMP)
1891 {
1892 VTRACE(("both then and else parts present\n"));
1893 if (!_validate_inner(code+1, code+skip-3, groups))
1894 FAIL;
1895 code += skip-2; /* Position after JUMP, at <skipno> */
1896 GET_SKIP;
1897 if (!_validate_inner(code, code+skip-1, groups))
1898 FAIL;
1899 code += skip-1;
1900 }
1901 else {
1902 VTRACE(("only a then part present\n"));
1903 if (!_validate_inner(code+1, code+skip-1, groups))
1904 FAIL;
1905 code += skip-1;
1906 }
1907 break;
1908
1909 case SRE_OP_ASSERT:
1910 case SRE_OP_ASSERT_NOT:
1911 GET_SKIP;
1912 GET_ARG; /* 0 for lookahead, width for lookbehind */
1913 code--; /* Back up over arg to simplify math below */
1914 if (arg & 0x80000000)
1915 FAIL; /* Width too large */
1916 /* Stop 1 before the end; we check the SUCCESS below */
1917 if (!_validate_inner(code+1, code+skip-2, groups))
1918 FAIL;
1919 code += skip-2;
1920 GET_OP;
1921 if (op != SRE_OP_SUCCESS)
1922 FAIL;
1923 break;
1924
1925 default:
1926 FAIL;
1927
1928 }
1929 }
1930
1931 VTRACE(("okay\n"));
1932 return 1;
1933}
1934
1935static int
1936_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1937{
1938 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
1939 FAIL;
1940 if (groups == 0) /* fix for simplejson */
1941 groups = 100; /* 100 groups should always be safe */
1942 return _validate_inner(code, end-1, groups);
1943}
1944
1945static int
1946_validate(PatternObject *self)
1947{
1948 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1949 {
1950 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1951 return 0;
1952 }
1953 else
1954 VTRACE(("Success!\n"));
1955 return 1;
1956}
1957
1958/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001959/* match methods */
1960
1961static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001962match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001963{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001964 Py_XDECREF(self->regs);
1965 Py_XDECREF(self->string);
1966 Py_DECREF(self->pattern);
1967 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001968}
1969
1970static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001971match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001972{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001973 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001974 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001975 Py_buffer view;
1976 PyObject *result;
1977 void* ptr;
1978
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001979 if (index < 0 || index >= self->groups) {
1980 /* raise IndexError if we were given a bad group number */
1981 PyErr_SetString(
1982 PyExc_IndexError,
1983 "no such group"
1984 );
1985 return NULL;
1986 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001987
Fredrik Lundh6f013982000-07-03 18:44:21 +00001988 index *= 2;
1989
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001990 if (self->string == Py_None || self->mark[index] < 0) {
1991 /* return default value if the string or group is undefined */
1992 Py_INCREF(def);
1993 return def;
1994 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001995
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001996 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001997 if (ptr == NULL)
1998 return NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001999 result = getslice(isbytes, ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +03002000 self->string, self->mark[index], self->mark[index+1]);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002001 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03002002 PyBuffer_Release(&view);
2003 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002004}
2005
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002006static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002007match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002008{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002009 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002010
Guido van Rossumddefaf32007-01-14 03:31:43 +00002011 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002012 /* Default value */
2013 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00002014
Christian Heimes217cfd12007-12-02 14:31:20 +00002015 if (PyLong_Check(index))
2016 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002017
Fredrik Lundh6f013982000-07-03 18:44:21 +00002018 i = -1;
2019
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002020 if (self->pattern->groupindex) {
2021 index = PyObject_GetItem(self->pattern->groupindex, index);
2022 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00002023 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00002024 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002025 Py_DECREF(index);
2026 } else
2027 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002028 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002029
2030 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002031}
2032
2033static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002034match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002035{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002036 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002037}
2038
2039static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002040match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002041{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002042 /* delegate to Python code */
2043 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002044 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002045 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002046 );
2047}
2048
2049static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002050match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002051{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002053 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002054
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002056
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 switch (size) {
2058 case 0:
2059 result = match_getslice(self, Py_False, Py_None);
2060 break;
2061 case 1:
2062 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2063 break;
2064 default:
2065 /* fetch multiple items */
2066 result = PyTuple_New(size);
2067 if (!result)
2068 return NULL;
2069 for (i = 0; i < size; i++) {
2070 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002071 self, PyTuple_GET_ITEM(args, i), Py_None
2072 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 if (!item) {
2074 Py_DECREF(result);
2075 return NULL;
2076 }
2077 PyTuple_SET_ITEM(result, i, item);
2078 }
2079 break;
2080 }
2081 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002082}
2083
2084static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002085match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002086{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002087 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002088 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002089
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002090 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002091 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002092 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002093 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002094
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002095 result = PyTuple_New(self->groups-1);
2096 if (!result)
2097 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002098
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002099 for (index = 1; index < self->groups; index++) {
2100 PyObject* item;
2101 item = match_getslice_by_index(self, index, def);
2102 if (!item) {
2103 Py_DECREF(result);
2104 return NULL;
2105 }
2106 PyTuple_SET_ITEM(result, index-1, item);
2107 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002108
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002109 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002110}
2111
2112static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002113match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002114{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002115 PyObject* result;
2116 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002117 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002118
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002119 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002120 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002121 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002122 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002123
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002124 result = PyDict_New();
2125 if (!result || !self->pattern->groupindex)
2126 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002127
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002129 if (!keys)
2130 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002131
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002132 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002133 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002134 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002135 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002136 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002137 if (!key)
2138 goto failed;
2139 value = match_getslice(self, key, def);
2140 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002141 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002142 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002143 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002144 status = PyDict_SetItem(result, key, value);
2145 Py_DECREF(value);
2146 if (status < 0)
2147 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002148 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002149
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002150 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002151
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002152 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002153
2154failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002155 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002156 Py_DECREF(result);
2157 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002158}
2159
2160static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002161match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002162{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002163 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002164
Guido van Rossumddefaf32007-01-14 03:31:43 +00002165 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002166 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002167 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002168
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002169 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002170
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002171 if (index < 0 || index >= self->groups) {
2172 PyErr_SetString(
2173 PyExc_IndexError,
2174 "no such group"
2175 );
2176 return NULL;
2177 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002178
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002179 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002180 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002181}
2182
2183static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002184match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002185{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002186 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002187
Guido van Rossumddefaf32007-01-14 03:31:43 +00002188 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002189 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002190 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002191
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002192 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002193
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002194 if (index < 0 || index >= self->groups) {
2195 PyErr_SetString(
2196 PyExc_IndexError,
2197 "no such group"
2198 );
2199 return NULL;
2200 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002201
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002202 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002203 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002204}
2205
2206LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002207_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002208{
2209 PyObject* pair;
2210 PyObject* item;
2211
2212 pair = PyTuple_New(2);
2213 if (!pair)
2214 return NULL;
2215
Christian Heimes217cfd12007-12-02 14:31:20 +00002216 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002217 if (!item)
2218 goto error;
2219 PyTuple_SET_ITEM(pair, 0, item);
2220
Christian Heimes217cfd12007-12-02 14:31:20 +00002221 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002222 if (!item)
2223 goto error;
2224 PyTuple_SET_ITEM(pair, 1, item);
2225
2226 return pair;
2227
2228 error:
2229 Py_DECREF(pair);
2230 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002231}
2232
2233static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002234match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002235{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002236 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002237
Guido van Rossumddefaf32007-01-14 03:31:43 +00002238 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002239 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002240 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002241
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002242 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002243
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002244 if (index < 0 || index >= self->groups) {
2245 PyErr_SetString(
2246 PyExc_IndexError,
2247 "no such group"
2248 );
2249 return NULL;
2250 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002251
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002252 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002253 return _pair(self->mark[index*2], self->mark[index*2+1]);
2254}
2255
2256static PyObject*
2257match_regs(MatchObject* self)
2258{
2259 PyObject* regs;
2260 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002261 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002262
2263 regs = PyTuple_New(self->groups);
2264 if (!regs)
2265 return NULL;
2266
2267 for (index = 0; index < self->groups; index++) {
2268 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2269 if (!item) {
2270 Py_DECREF(regs);
2271 return NULL;
2272 }
2273 PyTuple_SET_ITEM(regs, index, item);
2274 }
2275
2276 Py_INCREF(regs);
2277 self->regs = regs;
2278
2279 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002280}
2281
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002282static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002283match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002284{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002285#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002286 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002287 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00002288
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002289 slots = 2 * (self->pattern->groups+1);
2290
2291 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2292 if (!copy)
2293 return NULL;
2294
2295 /* this value a constant, but any compiler should be able to
2296 figure that out all by itself */
2297 offset = offsetof(MatchObject, string);
2298
2299 Py_XINCREF(self->pattern);
2300 Py_XINCREF(self->string);
2301 Py_XINCREF(self->regs);
2302
2303 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002304 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002305
2306 return (PyObject*) copy;
2307#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002308 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002309 return NULL;
2310#endif
2311}
2312
2313static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002314match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002315{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002316#ifdef USE_BUILTIN_COPY
2317 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002318
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002319 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002320 if (!copy)
2321 return NULL;
2322
2323 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2324 !deepcopy(&copy->string, memo) ||
2325 !deepcopy(&copy->regs, memo)) {
2326 Py_DECREF(copy);
2327 return NULL;
2328 }
2329
2330#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002331 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2332 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002333#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002334}
2335
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002336PyDoc_STRVAR(match_doc,
2337"The result of re.match() and re.search().\n\
2338Match objects always have a boolean value of True.");
2339
2340PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002341"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002342 Return subgroup(s) of the match by indices or names.\n\
2343 For 0 returns the entire match.");
2344
2345PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002346"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002347 Return index of the start of the substring matched by group.");
2348
2349PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002350"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002351 Return index of the end of the substring matched by group.");
2352
2353PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002354"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002355 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
2356
2357PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002358"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002359 Return a tuple containing all the subgroups of the match, from 1.\n\
2360 The default argument is used for groups\n\
2361 that did not participate in the match");
2362
2363PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002364"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002365 Return a dictionary containing all the named subgroups of the match,\n\
2366 keyed by the subgroup name. The default argument is used for groups\n\
2367 that did not participate in the match");
2368
2369PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002370"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002371 Return the string obtained by doing backslash substitution\n\
2372 on the string template, as done by the sub() method.");
2373
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002374static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002375 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2376 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
2377 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
2378 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
2379 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
2380 match_groups_doc},
2381 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
2382 match_groupdict_doc},
2383 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002384 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
2385 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002386 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002387};
2388
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002389static PyObject *
2390match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002391{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002392 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002393 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002394 Py_INCREF(Py_None);
2395 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00002396}
2397
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002398static PyObject *
2399match_lastgroup_get(MatchObject *self)
2400{
2401 if (self->pattern->indexgroup && self->lastindex >= 0) {
2402 PyObject* result = PySequence_GetItem(
2403 self->pattern->indexgroup, self->lastindex
2404 );
2405 if (result)
2406 return result;
2407 PyErr_Clear();
2408 }
2409 Py_INCREF(Py_None);
2410 return Py_None;
2411}
2412
2413static PyObject *
2414match_regs_get(MatchObject *self)
2415{
2416 if (self->regs) {
2417 Py_INCREF(self->regs);
2418 return self->regs;
2419 } else
2420 return match_regs(self);
2421}
2422
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002423static PyObject *
2424match_repr(MatchObject *self)
2425{
2426 PyObject *result;
2427 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2428 if (group0 == NULL)
2429 return NULL;
2430 result = PyUnicode_FromFormat(
2431 "<%s object; span=(%d, %d), match=%.50R>",
2432 Py_TYPE(self)->tp_name,
2433 self->mark[0], self->mark[1], group0);
2434 Py_DECREF(group0);
2435 return result;
2436}
2437
2438
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002439static PyGetSetDef match_getset[] = {
2440 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
2441 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
2442 {"regs", (getter)match_regs_get, (setter)NULL},
2443 {NULL}
2444};
2445
2446#define MATCH_OFF(x) offsetof(MatchObject, x)
2447static PyMemberDef match_members[] = {
2448 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
2449 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
2450 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
2451 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
2452 {NULL}
2453};
2454
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2457
Neal Norwitz57c179c2006-03-22 07:18:02 +00002458static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002459 PyVarObject_HEAD_INIT(NULL,0)
2460 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002461 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002462 (destructor)match_dealloc, /* tp_dealloc */
2463 0, /* tp_print */
2464 0, /* tp_getattr */
2465 0, /* tp_setattr */
2466 0, /* tp_reserved */
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002467 (reprfunc)match_repr, /* tp_repr */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002468 0, /* tp_as_number */
2469 0, /* tp_as_sequence */
2470 0, /* tp_as_mapping */
2471 0, /* tp_hash */
2472 0, /* tp_call */
2473 0, /* tp_str */
2474 0, /* tp_getattro */
2475 0, /* tp_setattro */
2476 0, /* tp_as_buffer */
2477 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002478 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002479 0, /* tp_traverse */
2480 0, /* tp_clear */
2481 0, /* tp_richcompare */
2482 0, /* tp_weaklistoffset */
2483 0, /* tp_iter */
2484 0, /* tp_iternext */
2485 match_methods, /* tp_methods */
2486 match_members, /* tp_members */
2487 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002488};
2489
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490static PyObject*
Victor Stinnerf5587782013-11-15 23:21:11 +01002491pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002492{
2493 /* create match object (from state object) */
2494
2495 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002496 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002497 char* base;
2498 int n;
2499
2500 if (status > 0) {
2501
2502 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002503 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002504 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
2505 2*(pattern->groups+1));
2506 if (!match)
2507 return NULL;
2508
2509 Py_INCREF(pattern);
2510 match->pattern = pattern;
2511
2512 Py_INCREF(state->string);
2513 match->string = state->string;
2514
2515 match->regs = NULL;
2516 match->groups = pattern->groups+1;
2517
2518 /* fill in group slices */
2519
2520 base = (char*) state->beginning;
2521 n = state->charsize;
2522
2523 match->mark[0] = ((char*) state->start - base) / n;
2524 match->mark[1] = ((char*) state->ptr - base) / n;
2525
2526 for (i = j = 0; i < pattern->groups; i++, j+=2)
2527 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2528 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2529 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2530 } else
2531 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2532
2533 match->pos = state->pos;
2534 match->endpos = state->endpos;
2535
2536 match->lastindex = state->lastindex;
2537
2538 return (PyObject*) match;
2539
2540 } else if (status == 0) {
2541
2542 /* no match */
2543 Py_INCREF(Py_None);
2544 return Py_None;
2545
2546 }
2547
2548 /* internal error */
2549 pattern_error(status);
2550 return NULL;
2551}
2552
2553
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002554/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002555/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002556
2557static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002558scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002559{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002560 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002561 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002562 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002563}
2564
2565static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002566scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002567{
2568 SRE_STATE* state = &self->state;
2569 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002570 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002571
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002572 if (state->start == NULL)
2573 Py_RETURN_NONE;
2574
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002575 state_reset(state);
2576
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002577 state->ptr = state->start;
2578
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03002579 status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
Thomas Wouters89f507f2006-12-13 04:49:30 +00002580 if (PyErr_Occurred())
2581 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002582
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002583 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002584 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002585
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002586 if (status == 0)
2587 state->start = NULL;
2588 else if (state->ptr != state->start)
2589 state->start = state->ptr;
2590 else if (state->ptr != state->end)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002591 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002592 else
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002593 state->start = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002594
2595 return match;
2596}
2597
2598
2599static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002600scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002601{
2602 SRE_STATE* state = &self->state;
2603 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002604 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002605
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002606 if (state->start == NULL)
2607 Py_RETURN_NONE;
2608
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002609 state_reset(state);
2610
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002611 state->ptr = state->start;
2612
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002613 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002614 if (PyErr_Occurred())
2615 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002616
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002617 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002618 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002619
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002620 if (status == 0)
2621 state->start = NULL;
2622 else if (state->ptr != state->start)
2623 state->start = state->ptr;
2624 else if (state->ptr != state->end)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002625 state->start = (void*) ((char*) state->ptr + state->charsize);
2626 else
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002627 state->start = NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002628
2629 return match;
2630}
2631
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002632static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002633 {"match", (PyCFunction) scanner_match, METH_NOARGS},
2634 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002635 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002636};
2637
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002638#define SCAN_OFF(x) offsetof(ScannerObject, x)
2639static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03002640 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002641 {NULL} /* Sentinel */
2642};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002643
Neal Norwitz57c179c2006-03-22 07:18:02 +00002644static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002645 PyVarObject_HEAD_INIT(NULL, 0)
2646 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002647 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002648 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002649 0, /* tp_print */
2650 0, /* tp_getattr */
2651 0, /* tp_setattr */
2652 0, /* tp_reserved */
2653 0, /* tp_repr */
2654 0, /* tp_as_number */
2655 0, /* tp_as_sequence */
2656 0, /* tp_as_mapping */
2657 0, /* tp_hash */
2658 0, /* tp_call */
2659 0, /* tp_str */
2660 0, /* tp_getattro */
2661 0, /* tp_setattro */
2662 0, /* tp_as_buffer */
2663 Py_TPFLAGS_DEFAULT, /* tp_flags */
2664 0, /* tp_doc */
2665 0, /* tp_traverse */
2666 0, /* tp_clear */
2667 0, /* tp_richcompare */
2668 0, /* tp_weaklistoffset */
2669 0, /* tp_iter */
2670 0, /* tp_iternext */
2671 scanner_methods, /* tp_methods */
2672 scanner_members, /* tp_members */
2673 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002674};
2675
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002676static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002677pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002678{
2679 /* create search state object */
2680
2681 ScannerObject* self;
2682
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02002683 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002684 Py_ssize_t start = 0;
2685 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02002686 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2687 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:scanner", kwlist,
2688 &string, &start, &end, &string2))
2689 return NULL;
2690
2691 string = fix_string_param(string, string2, "source");
2692 if (!string)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002693 return NULL;
2694
2695 /* create scanner object */
2696 self = PyObject_NEW(ScannerObject, &Scanner_Type);
2697 if (!self)
2698 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002699 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002700
2701 string = state_init(&self->state, pattern, string, start, end);
2702 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002703 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002704 return NULL;
2705 }
2706
2707 Py_INCREF(pattern);
2708 self->pattern = (PyObject*) pattern;
2709
2710 return (PyObject*) self;
2711}
2712
Guido van Rossumb700df92000-03-31 14:59:30 +00002713static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00002714 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002715 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00002716 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002717 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002718};
2719
Martin v. Löwis1a214512008-06-11 05:26:20 +00002720static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002721 PyModuleDef_HEAD_INIT,
2722 "_" SRE_MODULE,
2723 NULL,
2724 -1,
2725 _functions,
2726 NULL,
2727 NULL,
2728 NULL,
2729 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00002730};
2731
2732PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002733{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002734 PyObject* m;
2735 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002736 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002737
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00002738 /* Patch object types */
2739 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
2740 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00002741 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002742
Martin v. Löwis1a214512008-06-11 05:26:20 +00002743 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00002744 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002745 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002746 d = PyModule_GetDict(m);
2747
Christian Heimes217cfd12007-12-02 14:31:20 +00002748 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002749 if (x) {
2750 PyDict_SetItemString(d, "MAGIC", x);
2751 Py_DECREF(x);
2752 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002753
Christian Heimes217cfd12007-12-02 14:31:20 +00002754 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00002755 if (x) {
2756 PyDict_SetItemString(d, "CODESIZE", x);
2757 Py_DECREF(x);
2758 }
2759
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02002760 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
2761 if (x) {
2762 PyDict_SetItemString(d, "MAXREPEAT", x);
2763 Py_DECREF(x);
2764 }
2765
Neal Norwitzfe537132007-08-26 03:55:15 +00002766 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002767 if (x) {
2768 PyDict_SetItemString(d, "copyright", x);
2769 Py_DECREF(x);
2770 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00002771 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00002772}
2773
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002774/* vim:ts=4:sw=4:et
2775*/