blob: eb1106ad8055055134e1de8843941c9532964d26 [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl   created (based on existing template matcher code)
 * 2000-03-06 fl   first alpha, sort of
 * 2000-08-01 fl   fixes for 1.6b1
 * 2000-08-07 fl   use PyOS_CheckStack() if available
 * 2000-09-20 fl   added expand method
 * 2001-03-20 fl   lots of fixes for 2.1b2
 * 2001-04-15 fl   export copyright as Python attribute, not global
 * 2001-04-28 fl   added __copy__ methods (work in progress)
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl   added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl   added sub/subn primitive
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl   fixed empty sub/subn return type
 * 2003-04-18 mvl  fully support 4-byte codes
 * 2003-10-17 gn   implemented non recursive scheme
 * 2013-02-04 mrab added fullmatch primitive
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
37
/* version/copyright banner for this engine */
static char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000044#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000045
46#include "sre.h"
47
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030048#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
49
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000050#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000051
/* name of this module, minus the leading underscore
   (may be overridden from the build command line) */
#ifndef SRE_MODULE
#define SRE_MODULE "sre"
#endif

/* name of the Python-level module */
#define SRE_PY_MODULE "re"

/* defining this one enables tracing */
#undef VERBOSE

/* -------------------------------------------------------------------- */
/* optional features */

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY
70
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000071/* -------------------------------------------------------------------- */
72
/* LOCAL(type): declaration prefix for file-local helper functions.
   Picks the cheapest calling/inlining form the compiler offers. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
83
/* error codes (all negative) */
#define SRE_ERROR_ILLEGAL -1            /* illegal opcode */
#define SRE_ERROR_STATE -2              /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3    /* runaway recursion */
#define SRE_ERROR_MEMORY -9             /* out of memory */
#define SRE_ERROR_INTERRUPTED -10       /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000090
/* TRACE((fmt, ...)): printf-style tracing; the argument is a complete,
   parenthesized printf argument list.  Expands to nothing unless
   VERBOSE is defined. */
#ifdef VERBOSE
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
96
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000097/* -------------------------------------------------------------------- */
98/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000100/* default character predicates (run sre_chars.py to regenerate tables) */
101
/* bit flags stored per character in sre_char_info */
#define SRE_DIGIT_MASK      1
#define SRE_SPACE_MASK      2
#define SRE_LINEBREAK_MASK  4
#define SRE_ALNUM_MASK      8
#define SRE_WORD_MASK       16
107
Fredrik Lundh21009b92001-09-18 18:47:09 +0000108/* FIXME: this assumes ASCII. create tables in init_sre() instead */
109
/* Per-character flag bits for code points 0..127, 16 entries per row.
   Values seen here: 2 = space, 6 = space + linebreak,
   25 = digit + alnum + word, 24 = alnum + word, 16 = word only. */
static char sre_char_info[128] = {
    /*   0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /*  16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  32 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /*  64 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /*  96 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};
117
/* Lower-case mapping for code points 0..127, 16 entries per row:
   65..90 ('A'..'Z') map to 97..122, everything else maps to itself. */
static char sre_char_lower[128] = {
    /*   0 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /*  16 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /*  32 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /*  48 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /*  64 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
              93, 94, 95,
    /*  96 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
              109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
              124, 125, 126, 127
};
127
/* Table-driven predicates: each yields the masked sre_char_info flag
   (non-zero when set) for ch < 128, and 0 for anything larger. */
#define SRE_CHAR_INFO_TEST(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch)     SRE_CHAR_INFO_TEST(ch, SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch)     SRE_CHAR_INFO_TEST(ch, SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_CHAR_INFO_TEST(ch, SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch)     SRE_CHAR_INFO_TEST(ch, SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch)      SRE_CHAR_INFO_TEST(ch, SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000138
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000139static unsigned int sre_lower(unsigned int ch)
140{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000141 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000142}
143
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000144/* locale-specific character predicates */
/* The range test is spelled ((c & ~N) == 0), which equals (c < N+1) for
 * any unsigned c; this form avoids compiler warnings when c's type can
 * only hold values below N+1. */
#define SRE_LOC_IS_ALNUM(ch) ((((ch) & ~255) == 0) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
149
/* Lower-case ch with the C library's tolower() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower(ch);
}
154
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000155/* unicode-specific character predicates */
156
/* thin wrappers over the Py_UNICODE_* classification macros;
   a "word" character is an alphanumeric or '_' */
#define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch)      (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000162
/* Lower-case ch using Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);

    return lowered;
}
167
Guido van Rossumb700df92000-03-31 14:59:30 +0000168LOCAL(int)
169sre_category(SRE_CODE category, unsigned int ch)
170{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000171 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000172
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000173 case SRE_CATEGORY_DIGIT:
174 return SRE_IS_DIGIT(ch);
175 case SRE_CATEGORY_NOT_DIGIT:
176 return !SRE_IS_DIGIT(ch);
177 case SRE_CATEGORY_SPACE:
178 return SRE_IS_SPACE(ch);
179 case SRE_CATEGORY_NOT_SPACE:
180 return !SRE_IS_SPACE(ch);
181 case SRE_CATEGORY_WORD:
182 return SRE_IS_WORD(ch);
183 case SRE_CATEGORY_NOT_WORD:
184 return !SRE_IS_WORD(ch);
185 case SRE_CATEGORY_LINEBREAK:
186 return SRE_IS_LINEBREAK(ch);
187 case SRE_CATEGORY_NOT_LINEBREAK:
188 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000190 case SRE_CATEGORY_LOC_WORD:
191 return SRE_LOC_IS_WORD(ch);
192 case SRE_CATEGORY_LOC_NOT_WORD:
193 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000195 case SRE_CATEGORY_UNI_DIGIT:
196 return SRE_UNI_IS_DIGIT(ch);
197 case SRE_CATEGORY_UNI_NOT_DIGIT:
198 return !SRE_UNI_IS_DIGIT(ch);
199 case SRE_CATEGORY_UNI_SPACE:
200 return SRE_UNI_IS_SPACE(ch);
201 case SRE_CATEGORY_UNI_NOT_SPACE:
202 return !SRE_UNI_IS_SPACE(ch);
203 case SRE_CATEGORY_UNI_WORD:
204 return SRE_UNI_IS_WORD(ch);
205 case SRE_CATEGORY_UNI_NOT_WORD:
206 return !SRE_UNI_IS_WORD(ch);
207 case SRE_CATEGORY_UNI_LINEBREAK:
208 return SRE_UNI_IS_LINEBREAK(ch);
209 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
210 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 }
212 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000213}
214
215/* helpers */
216
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000217static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000218data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000219{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000220 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000222 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000223 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000224 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000225}
226
227static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000228data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000229{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000230 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000231 minsize = state->data_stack_base+size;
232 cursize = state->data_stack_size;
233 if (cursize < minsize) {
234 void* stack;
235 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300236 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000238 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000239 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000240 return SRE_ERROR_MEMORY;
241 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000242 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000243 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000244 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000246}
247
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000248/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000249
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300250#define SRE_CHAR Py_UCS1
251#define SIZEOF_SRE_CHAR 1
252#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300253#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000254
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300255/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000256
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300257#define SRE_CHAR Py_UCS2
258#define SIZEOF_SRE_CHAR 2
259#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300260#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000261
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300262/* generate 32-bit unicode version */
263
264#define SRE_CHAR Py_UCS4
265#define SIZEOF_SRE_CHAR 4
266#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300267#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000268
269/* -------------------------------------------------------------------- */
270/* factories and destructors */
271
272/* see sre.h for object declarations */
Victor Stinnerf5587782013-11-15 23:21:11 +0100273static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, Py_ssize_t);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600274static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +0000275
276static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000277sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +0000278{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +0100279 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000282static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +0000283sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000284{
285 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000286 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000287 return NULL;
288 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000289 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000290 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000291 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +0000292 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293}
294
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000295LOCAL(void)
296state_reset(SRE_STATE* state)
297{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000298 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000299 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000300
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000301 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000302 state->lastindex = -1;
303
304 state->repeat = NULL;
305
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000306 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000307}
308
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000309static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200310getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300311 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600312 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000313{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000314 /* given a python object, return a data pointer, a length (in
315 characters), and a character size. return NULL if the object
316 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000317
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000318 /* Unicode objects do not support the buffer API. So, get the data
319 directly instead. */
320 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 if (PyUnicode_READY(string) == -1)
322 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200323 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200324 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300325 *p_isbytes = 0;
326 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000327 }
328
Victor Stinner0058b862011-09-29 03:27:47 +0200329 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300330 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
331 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
332 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000333 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000334
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300335 *p_length = view->len;
336 *p_charsize = 1;
337 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000338
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300339 if (view->buf == NULL) {
340 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
341 PyBuffer_Release(view);
342 view->buf = NULL;
343 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300345 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000346}
347
348LOCAL(PyObject*)
349state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000350 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000351{
352 /* prepare state object */
353
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000354 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300355 int isbytes, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000356 void* ptr;
357
358 memset(state, 0, sizeof(SRE_STATE));
359
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000360 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000361 state->lastindex = -1;
362
Benjamin Petersone48944b2012-03-07 14:50:25 -0600363 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300364 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000365 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600366 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000367
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300368 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600369 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300370 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600371 goto err;
372 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300373 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600374 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300375 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600376 goto err;
377 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000378
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000379 /* adjust boundaries */
380 if (start < 0)
381 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000382 else if (start > length)
383 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000384
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000385 if (end < 0)
386 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000387 else if (end > length)
388 end = length;
389
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300390 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000391 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +0000392
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000393 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000395 state->start = (void*) ((char*) ptr + start * state->charsize);
396 state->end = (void*) ((char*) ptr + end * state->charsize);
397
398 Py_INCREF(string);
399 state->string = string;
400 state->pos = start;
401 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000402
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000403 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000404 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000405 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000406 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000407 else
Fredrik Lundhb389df32000-06-29 12:48:37 +0000408 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000410 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600411 err:
412 if (state->buffer.buf)
413 PyBuffer_Release(&state->buffer);
414 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000415}
416
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000417LOCAL(void)
418state_fini(SRE_STATE* state)
419{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600420 if (state->buffer.buf)
421 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000423 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000424}
425
/* convert a pointer into the subject (`member`) to a character offset
   from the start of the string */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
429
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000430LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300431getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300432 PyObject* string, Py_ssize_t start, Py_ssize_t end)
433{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300434 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300435 if (PyBytes_CheckExact(string) &&
436 start == 0 && end == PyBytes_GET_SIZE(string)) {
437 Py_INCREF(string);
438 return string;
439 }
440 return PyBytes_FromStringAndSize(
441 (const char *)ptr + start, end - start);
442 }
443 else {
444 return PyUnicode_Substring(string, start, end);
445 }
446}
447
448LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000449state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000450{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000451 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000452
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000453 index = (index - 1) * 2;
454
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000455 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000456 if (empty)
457 /* want empty string */
458 i = j = 0;
459 else {
460 Py_INCREF(Py_None);
461 return Py_None;
462 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000463 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000464 i = STATE_OFFSET(state, state->mark[index]);
465 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000466 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000467
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300468 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000469}
470
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000471static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100472pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000473{
474 switch (status) {
475 case SRE_ERROR_RECURSION_LIMIT:
476 PyErr_SetString(
477 PyExc_RuntimeError,
478 "maximum recursion limit exceeded"
479 );
480 break;
481 case SRE_ERROR_MEMORY:
482 PyErr_NoMemory();
483 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000484 case SRE_ERROR_INTERRUPTED:
485 /* An exception has already been raised, so let it fly */
486 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000487 default:
488 /* other error codes indicate compiler/engine bugs */
489 PyErr_SetString(
490 PyExc_RuntimeError,
491 "internal error in regular expression engine"
492 );
493 }
494}
495
Guido van Rossumb700df92000-03-31 14:59:30 +0000496static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000497pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000498{
Raymond Hettinger027bb632004-05-31 03:09:25 +0000499 if (self->weakreflist != NULL)
500 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000501 Py_XDECREF(self->pattern);
502 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000503 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +0000505}
506
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300507LOCAL(Py_ssize_t)
508sre_match(SRE_STATE* state, SRE_CODE* pattern)
509{
510 if (state->charsize == 1)
511 return sre_ucs1_match(state, pattern);
512 if (state->charsize == 2)
513 return sre_ucs2_match(state, pattern);
514 assert(state->charsize == 4);
515 return sre_ucs4_match(state, pattern);
516}
517
518LOCAL(Py_ssize_t)
519sre_search(SRE_STATE* state, SRE_CODE* pattern)
520{
521 if (state->charsize == 1)
522 return sre_ucs1_search(state, pattern);
523 if (state->charsize == 2)
524 return sre_ucs2_search(state, pattern);
525 assert(state->charsize == 4);
526 return sre_ucs4_search(state, pattern);
527}
528
Larry Hastings16c51912014-01-07 11:53:01 -0800529static PyObject *
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200530fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
531{
532 if (string2 != NULL) {
533 if (string != NULL) {
534 PyErr_Format(PyExc_TypeError,
535 "Argument given by name ('%s') and position (1)",
536 oldname);
537 return NULL;
538 }
539 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
540 "The '%s' keyword parameter name is deprecated. "
541 "Use 'string' instead.", oldname) < 0)
542 return NULL;
543 return string2;
544 }
545 if (string == NULL) {
546 PyErr_SetString(PyExc_TypeError,
547 "Required argument 'string' (pos 1) not found");
548 return NULL;
549 }
550 return string;
551}
Larry Hastings16c51912014-01-07 11:53:01 -0800552
553static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -0800554pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
Larry Hastings16c51912014-01-07 11:53:01 -0800555{
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200556 static char *_keywords[] = {"string", "pos", "endpos", "pattern", NULL};
557 PyObject *string = NULL;
Larry Hastings16c51912014-01-07 11:53:01 -0800558 Py_ssize_t pos = 0;
559 Py_ssize_t endpos = PY_SSIZE_T_MAX;
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200560 PyObject *pattern = NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000561 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100562 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +0000563
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200564 if (!PyArg_ParseTupleAndKeywords(args, kwargs,
565 "|Onn$O:match", _keywords,
566 &string, &pos, &endpos, &pattern))
567 return NULL;
568 string = fix_string_param(string, pattern, "pattern");
569 if (!string)
570 return NULL;
571 string = state_init(&state, (PatternObject *)self, string, pos, endpos);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000572 if (!string)
573 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000574
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000575 state.ptr = state.start;
576
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000577 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
578
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300579 status = sre_match(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000580
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000581 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +0000582 if (PyErr_Occurred())
583 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000584
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000585 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000586
Larry Hastings16c51912014-01-07 11:53:01 -0800587 return (PyObject *)pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +0000588}
589
590static PyObject*
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200591pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
592{
593 SRE_STATE state;
594 Py_ssize_t status;
595
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200596 PyObject *string = NULL, *string2 = NULL;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200597 Py_ssize_t start = 0;
598 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200599 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200600 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:fullmatch", kwlist,
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200601 &string, &start, &end, &string2))
602 return NULL;
603
604 string = fix_string_param(string, string2, "pattern");
605 if (!string)
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200606 return NULL;
607
608 string = state_init(&state, self, string, start, end);
609 if (!string)
610 return NULL;
611
612 state.match_all = 1;
613 state.ptr = state.start;
614
615 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
616
617 status = sre_match(&state, PatternObject_GetCode(self));
618
619 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
620 if (PyErr_Occurred())
621 return NULL;
622
623 state_fini(&state);
624
625 return pattern_new_match(self, &state, status);
626}
627
628static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000629pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000630{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000631 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100632 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +0000633
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200634 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000635 Py_ssize_t start = 0;
636 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200637 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
638 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:search", kwlist,
639 &string, &start, &end, &string2))
640 return NULL;
641
642 string = fix_string_param(string, string2, "pattern");
643 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000644 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000645
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000646 string = state_init(&state, self, string, start, end);
647 if (!string)
648 return NULL;
649
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000650 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
651
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300652 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000653
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000654 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
655
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000656 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000657
Thomas Wouters89f507f2006-12-13 04:49:30 +0000658 if (PyErr_Occurred())
659 return NULL;
660
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000661 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +0000662}
663
664static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000665call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000666{
667 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000668 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000669 PyObject* func;
670 PyObject* result;
671
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000672 if (!args)
673 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000674 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000675 if (!name)
676 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000677 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000678 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000679 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000680 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000681 func = PyObject_GetAttrString(mod, function);
682 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000683 if (!func)
684 return NULL;
685 result = PyObject_CallObject(func, args);
686 Py_DECREF(func);
687 Py_DECREF(args);
688 return result;
689}
690
#ifdef USE_BUILTIN_COPY
/* Replace *object with the result of copy.deepcopy(*object, memo).
 * Returns 1 on success.  On failure returns 0 and leaves *object as it
 * was.
 */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    /* call() copes with a NULL tuple and releases it otherwise */
    PyObject* packed = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", packed);

    if (duplicate == NULL) {
        return 0;
    }

    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
710
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000711static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000712pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000713{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000714 SRE_STATE state;
715 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100716 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000717 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000718
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200719 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000720 Py_ssize_t start = 0;
721 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200722 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
723 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:findall", kwlist,
724 &string, &start, &end, &string2))
725 return NULL;
726
727 string = fix_string_param(string, string2, "source");
728 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000729 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000730
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000731 string = state_init(&state, self, string, start, end);
732 if (!string)
733 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000734
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000735 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000736 if (!list) {
737 state_fini(&state);
738 return NULL;
739 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000740
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000741 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000742
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000743 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000744
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000745 state_reset(&state);
746
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000747 state.ptr = state.start;
748
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300749 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300750 if (PyErr_Occurred())
751 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000752
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000753 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000754 if (status == 0)
755 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000756 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000757 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000758 }
Tim Peters3d563502006-01-21 02:47:53 +0000759
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000760 /* don't bother to build a match object */
761 switch (self->groups) {
762 case 0:
763 b = STATE_OFFSET(&state, state.start);
764 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300765 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300766 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000767 if (!item)
768 goto error;
769 break;
770 case 1:
771 item = state_getslice(&state, 1, string, 1);
772 if (!item)
773 goto error;
774 break;
775 default:
776 item = PyTuple_New(self->groups);
777 if (!item)
778 goto error;
779 for (i = 0; i < self->groups; i++) {
780 PyObject* o = state_getslice(&state, i+1, string, 1);
781 if (!o) {
782 Py_DECREF(item);
783 goto error;
784 }
785 PyTuple_SET_ITEM(item, i, o);
786 }
787 break;
788 }
789
790 status = PyList_Append(list, item);
791 Py_DECREF(item);
792 if (status < 0)
793 goto error;
794
795 if (state.ptr == state.start)
796 state.start = (void*) ((char*) state.ptr + state.charsize);
797 else
798 state.start = state.ptr;
799
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000800 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000801
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000802 state_fini(&state);
803 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000804
805error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000806 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000807 state_fini(&state);
808 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000809
Guido van Rossumb700df92000-03-31 14:59:30 +0000810}
811
Fredrik Lundh703ce812001-10-24 22:16:30 +0000812static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600813pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
Fredrik Lundh703ce812001-10-24 22:16:30 +0000814{
815 PyObject* scanner;
816 PyObject* search;
817 PyObject* iterator;
818
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600819 scanner = pattern_scanner(pattern, args, kw);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000820 if (!scanner)
821 return NULL;
822
823 search = PyObject_GetAttrString(scanner, "search");
824 Py_DECREF(scanner);
825 if (!search)
826 return NULL;
827
828 iterator = PyCallIter_New(search, Py_None);
829 Py_DECREF(search);
830
831 return iterator;
832}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000833
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000834static PyObject*
835pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
836{
837 SRE_STATE state;
838 PyObject* list;
839 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100840 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000841 Py_ssize_t n;
842 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000843 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000844
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200845 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000846 Py_ssize_t maxsplit = 0;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200847 static char* kwlist[] = { "string", "maxsplit", "source", NULL };
848 if (!PyArg_ParseTupleAndKeywords(args, kw, "|On$O:split", kwlist,
849 &string, &maxsplit, &string2))
850 return NULL;
851
852 string = fix_string_param(string, string2, "source");
853 if (!string)
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000854 return NULL;
855
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000856 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000857 if (!string)
858 return NULL;
859
860 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000861 if (!list) {
862 state_fini(&state);
863 return NULL;
864 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000865
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000866 n = 0;
867 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000868
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000869 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000870
871 state_reset(&state);
872
873 state.ptr = state.start;
874
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300875 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300876 if (PyErr_Occurred())
877 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000878
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000879 if (status <= 0) {
880 if (status == 0)
881 break;
882 pattern_error(status);
883 goto error;
884 }
Tim Peters3d563502006-01-21 02:47:53 +0000885
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000886 if (state.start == state.ptr) {
887 if (last == state.end)
888 break;
889 /* skip one character */
890 state.start = (void*) ((char*) state.ptr + state.charsize);
891 continue;
892 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000893
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000894 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300895 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000896 string, STATE_OFFSET(&state, last),
897 STATE_OFFSET(&state, state.start)
898 );
899 if (!item)
900 goto error;
901 status = PyList_Append(list, item);
902 Py_DECREF(item);
903 if (status < 0)
904 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000905
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000906 /* add groups (if any) */
907 for (i = 0; i < self->groups; i++) {
908 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000909 if (!item)
910 goto error;
911 status = PyList_Append(list, item);
912 Py_DECREF(item);
913 if (status < 0)
914 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000915 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000916
917 n = n + 1;
918
919 last = state.start = state.ptr;
920
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000921 }
922
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000923 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300924 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000925 string, STATE_OFFSET(&state, last), state.endpos
926 );
927 if (!item)
928 goto error;
929 status = PyList_Append(list, item);
930 Py_DECREF(item);
931 if (status < 0)
932 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000933
934 state_fini(&state);
935 return list;
936
937error:
938 Py_DECREF(list);
939 state_fini(&state);
940 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000941
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000942}
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000943
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000944static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000945pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000946 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000947{
948 SRE_STATE state;
949 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +0300950 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000951 PyObject* item;
952 PyObject* filter;
953 PyObject* args;
954 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000955 void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100956 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000957 Py_ssize_t n;
958 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300959 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000960 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600961 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000962
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000963 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +0000964 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000965 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +0000966 Py_INCREF(filter);
967 filter_is_callable = 1;
968 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000969 /* if not callable, check if it's a literal string */
970 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600971 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300972 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000974 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300975 if (charsize == 1)
976 literal = memchr(ptr, '\\', n) == NULL;
977 else
978 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000979 } else {
980 PyErr_Clear();
981 literal = 0;
982 }
Benjamin Petersone48944b2012-03-07 14:50:25 -0600983 if (view.buf)
984 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000985 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000986 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000987 Py_INCREF(filter);
988 filter_is_callable = 0;
989 } else {
990 /* not a literal; hand it over to the template compiler */
991 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +0000992 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000993 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000994 );
995 if (!filter)
996 return NULL;
997 filter_is_callable = PyCallable_Check(filter);
998 }
Fredrik Lundhdac58492001-10-21 21:48:30 +0000999 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001000
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001001 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001002 if (!string) {
1003 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001004 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001005 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001006
1007 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001008 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001009 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001010 state_fini(&state);
1011 return NULL;
1012 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001013
1014 n = i = 0;
1015
1016 while (!count || n < count) {
1017
1018 state_reset(&state);
1019
1020 state.ptr = state.start;
1021
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001022 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001023 if (PyErr_Occurred())
1024 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001025
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001026 if (status <= 0) {
1027 if (status == 0)
1028 break;
1029 pattern_error(status);
1030 goto error;
1031 }
Tim Peters3d563502006-01-21 02:47:53 +00001032
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001033 b = STATE_OFFSET(&state, state.start);
1034 e = STATE_OFFSET(&state, state.ptr);
1035
1036 if (i < b) {
1037 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001038 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001039 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001040 if (!item)
1041 goto error;
1042 status = PyList_Append(list, item);
1043 Py_DECREF(item);
1044 if (status < 0)
1045 goto error;
1046
1047 } else if (i == b && i == e && n > 0)
1048 /* ignore empty match on latest position */
1049 goto next;
1050
1051 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001052 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001053 match = pattern_new_match(self, &state, 1);
1054 if (!match)
1055 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001056 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001057 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00001058 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001059 goto error;
1060 }
1061 item = PyObject_CallObject(filter, args);
1062 Py_DECREF(args);
1063 Py_DECREF(match);
1064 if (!item)
1065 goto error;
1066 } else {
1067 /* filter is literal string */
1068 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001069 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001070 }
1071
1072 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001073 if (item != Py_None) {
1074 status = PyList_Append(list, item);
1075 Py_DECREF(item);
1076 if (status < 0)
1077 goto error;
1078 }
Tim Peters3d563502006-01-21 02:47:53 +00001079
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001080 i = e;
1081 n = n + 1;
1082
1083next:
1084 /* move on */
1085 if (state.ptr == state.start)
1086 state.start = (void*) ((char*) state.ptr + state.charsize);
1087 else
1088 state.start = state.ptr;
1089
1090 }
1091
1092 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001093 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001094 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001095 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001096 if (!item)
1097 goto error;
1098 status = PyList_Append(list, item);
1099 Py_DECREF(item);
1100 if (status < 0)
1101 goto error;
1102 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001103
1104 state_fini(&state);
1105
Guido van Rossum4e173842001-12-07 04:25:10 +00001106 Py_DECREF(filter);
1107
Fredrik Lundhdac58492001-10-21 21:48:30 +00001108 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001109 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001110 if (!joiner) {
1111 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001112 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001113 }
1114 if (PyList_GET_SIZE(list) == 0) {
1115 Py_DECREF(list);
1116 item = joiner;
1117 }
1118 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001119 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001120 item = _PyBytes_Join(joiner, list);
1121 else
1122 item = PyUnicode_Join(joiner, list);
1123 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001124 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001125 if (!item)
1126 return NULL;
1127 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001128
1129 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001130 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001131
1132 return item;
1133
1134error:
1135 Py_DECREF(list);
1136 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001137 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001138 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001139
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001140}
1141
1142static PyObject*
1143pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
1144{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001145 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001146 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001147 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001148 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001149 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001150 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001151 return NULL;
1152
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001153 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001154}
1155
1156static PyObject*
1157pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
1158{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001159 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001160 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001161 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001162 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001163 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001164 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001165 return NULL;
1166
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001167 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001168}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001169
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001170static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001171pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001172{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001173#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001174 PatternObject* copy;
1175 int offset;
1176
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001177 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1178 if (!copy)
1179 return NULL;
1180
1181 offset = offsetof(PatternObject, groups);
1182
1183 Py_XINCREF(self->groupindex);
1184 Py_XINCREF(self->indexgroup);
1185 Py_XINCREF(self->pattern);
1186
1187 memcpy((char*) copy + offset, (char*) self + offset,
1188 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00001189 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001190
1191 return (PyObject*) copy;
1192#else
1193 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1194 return NULL;
1195#endif
1196}
1197
1198static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001199pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001200{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001201#ifdef USE_BUILTIN_COPY
1202 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00001203
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001204 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001205 if (!copy)
1206 return NULL;
1207
1208 if (!deepcopy(&copy->groupindex, memo) ||
1209 !deepcopy(&copy->indexgroup, memo) ||
1210 !deepcopy(&copy->pattern, memo)) {
1211 Py_DECREF(copy);
1212 return NULL;
1213 }
1214
1215#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001216 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1217 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001218#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001219}
1220
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001221static PyObject *
1222pattern_repr(PatternObject *obj)
1223{
1224 static const struct {
1225 const char *name;
1226 int value;
1227 } flag_names[] = {
1228 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1229 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1230 {"re.LOCALE", SRE_FLAG_LOCALE},
1231 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1232 {"re.DOTALL", SRE_FLAG_DOTALL},
1233 {"re.UNICODE", SRE_FLAG_UNICODE},
1234 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1235 {"re.DEBUG", SRE_FLAG_DEBUG},
1236 {"re.ASCII", SRE_FLAG_ASCII},
1237 };
1238 PyObject *result = NULL;
1239 PyObject *flag_items;
1240 int i;
1241 int flags = obj->flags;
1242
1243 /* Omit re.UNICODE for valid string patterns. */
1244 if (obj->isbytes == 0 &&
1245 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1246 SRE_FLAG_UNICODE)
1247 flags &= ~SRE_FLAG_UNICODE;
1248
1249 flag_items = PyList_New(0);
1250 if (!flag_items)
1251 return NULL;
1252
1253 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1254 if (flags & flag_names[i].value) {
1255 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1256 if (!item)
1257 goto done;
1258
1259 if (PyList_Append(flag_items, item) < 0) {
1260 Py_DECREF(item);
1261 goto done;
1262 }
1263 Py_DECREF(item);
1264 flags &= ~flag_names[i].value;
1265 }
1266 }
1267 if (flags) {
1268 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1269 if (!item)
1270 goto done;
1271
1272 if (PyList_Append(flag_items, item) < 0) {
1273 Py_DECREF(item);
1274 goto done;
1275 }
1276 Py_DECREF(item);
1277 }
1278
1279 if (PyList_Size(flag_items) > 0) {
1280 PyObject *flags_result;
1281 PyObject *sep = PyUnicode_FromString("|");
1282 if (!sep)
1283 goto done;
1284 flags_result = PyUnicode_Join(sep, flag_items);
1285 Py_DECREF(sep);
1286 if (!flags_result)
1287 goto done;
1288 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1289 obj->pattern, flags_result);
1290 Py_DECREF(flags_result);
1291 }
1292 else {
1293 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1294 }
1295
1296done:
1297 Py_DECREF(flag_items);
1298 return result;
1299}
1300
/* Docstrings for the pattern object.  Each pattern_<method>_doc string is
   attached to the matching entry of pattern_methods[]; pattern_doc is used
   as the tp_doc slot of Pattern_Type. */
Raymond Hettinger94478742004-09-24 04:31:19 +00001301PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001302"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001303 Matches zero or more characters at the beginning of the string");
1304
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001305PyDoc_STRVAR(pattern_fullmatch_doc,
1306"fullmatch(string[, pos[, endpos]]) -> match object or None.\n\
1307 Matches against all of the string");
1308
Raymond Hettinger94478742004-09-24 04:31:19 +00001309PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001310"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001311 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02001312 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001313
1314PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001315"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001316 Split string by the occurrences of pattern.");
1317
1318PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001319"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001320 Return a list of all non-overlapping matches of pattern in string.");
1321
1322PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001323"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001324 Return an iterator over all non-overlapping matches for the \n\
1325 RE pattern in string. For each match, the iterator returns a\n\
1326 match object.");
1327
1328PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001329"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001330 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00001331 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001332
1333PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001334"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001335 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
1336 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00001337 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001338
1339PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
1340
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001341static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00001342 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001343 pattern_match_doc},
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001344 {"fullmatch", (PyCFunction) pattern_fullmatch, METH_VARARGS|METH_KEYWORDS,
1345 pattern_fullmatch_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001346 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001347 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001348 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001349 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001350 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001351 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001352 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001353 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001354 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001355 pattern_findall_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001356 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001357 pattern_finditer_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001358 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001359 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
1360 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001361 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00001362};
1363
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00001364#define PAT_OFF(x) offsetof(PatternObject, x)
1365static PyMemberDef pattern_members[] = {
1366 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
1367 {"flags", T_INT, PAT_OFF(flags), READONLY},
1368 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
1369 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
1370 {NULL} /* Sentinel */
1371};
Guido van Rossumb700df92000-03-31 14:59:30 +00001372
/* Type object for compiled pattern objects.  The type is variable-sized:
   the basic size is sizeof(PatternObject) and each item is one SRE_CODE
   word.  Slots not listed after tp_members are left zero-initialized. */
static PyTypeObject Pattern_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_" SRE_MODULE ".SRE_Pattern",      /* tp_name */
    sizeof(PatternObject), sizeof(SRE_CODE), /* tp_basicsize, tp_itemsize */
    (destructor)pattern_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    (reprfunc)pattern_repr,             /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    0,                                  /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    pattern_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    offsetof(PatternObject, weakreflist),       /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    pattern_methods,                    /* tp_methods */
    pattern_members,                    /* tp_members */
};
1403
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001404static int _validate(PatternObject *self); /* Forward */
1405
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001406static PyObject *
1407_compile(PyObject* self_, PyObject* args)
1408{
1409 /* "compile" pattern descriptor to pattern object */
1410
1411 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001412 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001413
1414 PyObject* pattern;
1415 int flags = 0;
1416 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001417 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001418 PyObject* groupindex = NULL;
1419 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001420
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001421 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001422 &PyList_Type, &code, &groups,
1423 &groupindex, &indexgroup))
1424 return NULL;
1425
1426 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001427 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001428 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1429 if (!self)
1430 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001431 self->weakreflist = NULL;
1432 self->pattern = NULL;
1433 self->groupindex = NULL;
1434 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001435
1436 self->codesize = n;
1437
1438 for (i = 0; i < n; i++) {
1439 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001440 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001441 self->code[i] = (SRE_CODE) value;
1442 if ((unsigned long) self->code[i] != value) {
1443 PyErr_SetString(PyExc_OverflowError,
1444 "regular expression code size limit exceeded");
1445 break;
1446 }
1447 }
1448
1449 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001450 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001451 return NULL;
1452 }
1453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001455 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 else {
1458 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001459 int charsize;
1460 Py_buffer view;
1461 view.buf = NULL;
1462 if (!getstring(pattern, &p_length, &self->isbytes,
1463 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 Py_DECREF(self);
1465 return NULL;
1466 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001467 if (view.buf)
1468 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001470
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001471 Py_INCREF(pattern);
1472 self->pattern = pattern;
1473
1474 self->flags = flags;
1475
1476 self->groups = groups;
1477
1478 Py_XINCREF(groupindex);
1479 self->groupindex = groupindex;
1480
1481 Py_XINCREF(indexgroup);
1482 self->indexgroup = indexgroup;
1483
1484 self->weakreflist = NULL;
1485
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001486 if (!_validate(self)) {
1487 Py_DECREF(self);
1488 return NULL;
1489 }
1490
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001491 return (PyObject*) self;
1492}
1493
Guido van Rossumb700df92000-03-31 14:59:30 +00001494/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001495/* Code validation */
1496
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1518
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesized
   printf argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros expect the enclosing function to have `code` and `end`
   pointers in scope plus the variable they assign (`op`, `arg` or `skip`).
   Each one FAILs instead of reading at or beyond `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL if skip-adj exceeds the number of code words
   left before `end`, counted from the skip word itself. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, (void *)(code+skip))); \
        if (skip-(adj) > (Py_uintptr_t)(end - code))    \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001559
1560static int
1561_validate_charset(SRE_CODE *code, SRE_CODE *end)
1562{
1563 /* Some variables are manipulated by the macros above */
1564 SRE_CODE op;
1565 SRE_CODE arg;
1566 SRE_CODE offset;
1567 int i;
1568
1569 while (code < end) {
1570 GET_OP;
1571 switch (op) {
1572
1573 case SRE_OP_NEGATE:
1574 break;
1575
1576 case SRE_OP_LITERAL:
1577 GET_ARG;
1578 break;
1579
1580 case SRE_OP_RANGE:
1581 GET_ARG;
1582 GET_ARG;
1583 break;
1584
1585 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001586 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001587 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001588 FAIL;
1589 code += offset;
1590 break;
1591
1592 case SRE_OP_BIGCHARSET:
1593 GET_ARG; /* Number of blocks */
1594 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001595 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001596 FAIL;
1597 /* Make sure that each byte points to a valid block */
1598 for (i = 0; i < 256; i++) {
1599 if (((unsigned char *)code)[i] >= arg)
1600 FAIL;
1601 }
1602 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001603 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001604 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001605 FAIL;
1606 code += offset;
1607 break;
1608
1609 case SRE_OP_CATEGORY:
1610 GET_ARG;
1611 switch (arg) {
1612 case SRE_CATEGORY_DIGIT:
1613 case SRE_CATEGORY_NOT_DIGIT:
1614 case SRE_CATEGORY_SPACE:
1615 case SRE_CATEGORY_NOT_SPACE:
1616 case SRE_CATEGORY_WORD:
1617 case SRE_CATEGORY_NOT_WORD:
1618 case SRE_CATEGORY_LINEBREAK:
1619 case SRE_CATEGORY_NOT_LINEBREAK:
1620 case SRE_CATEGORY_LOC_WORD:
1621 case SRE_CATEGORY_LOC_NOT_WORD:
1622 case SRE_CATEGORY_UNI_DIGIT:
1623 case SRE_CATEGORY_UNI_NOT_DIGIT:
1624 case SRE_CATEGORY_UNI_SPACE:
1625 case SRE_CATEGORY_UNI_NOT_SPACE:
1626 case SRE_CATEGORY_UNI_WORD:
1627 case SRE_CATEGORY_UNI_NOT_WORD:
1628 case SRE_CATEGORY_UNI_LINEBREAK:
1629 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1630 break;
1631 default:
1632 FAIL;
1633 }
1634 break;
1635
1636 default:
1637 FAIL;
1638
1639 }
1640 }
1641
1642 return 1;
1643}
1644
1645static int
1646_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1647{
1648 /* Some variables are manipulated by the macros above */
1649 SRE_CODE op;
1650 SRE_CODE arg;
1651 SRE_CODE skip;
1652
1653 VTRACE(("code=%p, end=%p\n", code, end));
1654
1655 if (code > end)
1656 FAIL;
1657
1658 while (code < end) {
1659 GET_OP;
1660 switch (op) {
1661
1662 case SRE_OP_MARK:
1663 /* We don't check whether marks are properly nested; the
1664 sre_match() code is robust even if they don't, and the worst
1665 you can get is nonsensical match results. */
1666 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001667 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001668 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1669 FAIL;
1670 }
1671 break;
1672
1673 case SRE_OP_LITERAL:
1674 case SRE_OP_NOT_LITERAL:
1675 case SRE_OP_LITERAL_IGNORE:
1676 case SRE_OP_NOT_LITERAL_IGNORE:
1677 GET_ARG;
1678 /* The arg is just a character, nothing to check */
1679 break;
1680
1681 case SRE_OP_SUCCESS:
1682 case SRE_OP_FAILURE:
1683 /* Nothing to check; these normally end the matching process */
1684 break;
1685
1686 case SRE_OP_AT:
1687 GET_ARG;
1688 switch (arg) {
1689 case SRE_AT_BEGINNING:
1690 case SRE_AT_BEGINNING_STRING:
1691 case SRE_AT_BEGINNING_LINE:
1692 case SRE_AT_END:
1693 case SRE_AT_END_LINE:
1694 case SRE_AT_END_STRING:
1695 case SRE_AT_BOUNDARY:
1696 case SRE_AT_NON_BOUNDARY:
1697 case SRE_AT_LOC_BOUNDARY:
1698 case SRE_AT_LOC_NON_BOUNDARY:
1699 case SRE_AT_UNI_BOUNDARY:
1700 case SRE_AT_UNI_NON_BOUNDARY:
1701 break;
1702 default:
1703 FAIL;
1704 }
1705 break;
1706
1707 case SRE_OP_ANY:
1708 case SRE_OP_ANY_ALL:
1709 /* These have no operands */
1710 break;
1711
1712 case SRE_OP_IN:
1713 case SRE_OP_IN_IGNORE:
1714 GET_SKIP;
1715 /* Stop 1 before the end; we check the FAILURE below */
1716 if (!_validate_charset(code, code+skip-2))
1717 FAIL;
1718 if (code[skip-2] != SRE_OP_FAILURE)
1719 FAIL;
1720 code += skip-1;
1721 break;
1722
1723 case SRE_OP_INFO:
1724 {
1725 /* A minimal info field is
1726 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1727 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1728 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001729 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001730 SRE_CODE *newcode;
1731 GET_SKIP;
1732 newcode = code+skip-1;
1733 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001734 GET_ARG;
1735 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001736 /* Check that only valid flags are present */
1737 if ((flags & ~(SRE_INFO_PREFIX |
1738 SRE_INFO_LITERAL |
1739 SRE_INFO_CHARSET)) != 0)
1740 FAIL;
1741 /* PREFIX and CHARSET are mutually exclusive */
1742 if ((flags & SRE_INFO_PREFIX) &&
1743 (flags & SRE_INFO_CHARSET))
1744 FAIL;
1745 /* LITERAL implies PREFIX */
1746 if ((flags & SRE_INFO_LITERAL) &&
1747 !(flags & SRE_INFO_PREFIX))
1748 FAIL;
1749 /* Validate the prefix */
1750 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001751 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001752 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001753 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001754 /* Here comes the prefix string */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001755 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001756 FAIL;
1757 code += prefix_len;
1758 /* And here comes the overlap table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001759 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001760 FAIL;
1761 /* Each overlap value should be < prefix_len */
1762 for (i = 0; i < prefix_len; i++) {
1763 if (code[i] >= prefix_len)
1764 FAIL;
1765 }
1766 code += prefix_len;
1767 }
1768 /* Validate the charset */
1769 if (flags & SRE_INFO_CHARSET) {
1770 if (!_validate_charset(code, newcode-1))
1771 FAIL;
1772 if (newcode[-1] != SRE_OP_FAILURE)
1773 FAIL;
1774 code = newcode;
1775 }
1776 else if (code != newcode) {
1777 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1778 FAIL;
1779 }
1780 }
1781 break;
1782
1783 case SRE_OP_BRANCH:
1784 {
1785 SRE_CODE *target = NULL;
1786 for (;;) {
1787 GET_SKIP;
1788 if (skip == 0)
1789 break;
1790 /* Stop 2 before the end; we check the JUMP below */
1791 if (!_validate_inner(code, code+skip-3, groups))
1792 FAIL;
1793 code += skip-3;
1794 /* Check that it ends with a JUMP, and that each JUMP
1795 has the same target */
1796 GET_OP;
1797 if (op != SRE_OP_JUMP)
1798 FAIL;
1799 GET_SKIP;
1800 if (target == NULL)
1801 target = code+skip-1;
1802 else if (code+skip-1 != target)
1803 FAIL;
1804 }
1805 }
1806 break;
1807
1808 case SRE_OP_REPEAT_ONE:
1809 case SRE_OP_MIN_REPEAT_ONE:
1810 {
1811 SRE_CODE min, max;
1812 GET_SKIP;
1813 GET_ARG; min = arg;
1814 GET_ARG; max = arg;
1815 if (min > max)
1816 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001817 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001818 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001819 if (!_validate_inner(code, code+skip-4, groups))
1820 FAIL;
1821 code += skip-4;
1822 GET_OP;
1823 if (op != SRE_OP_SUCCESS)
1824 FAIL;
1825 }
1826 break;
1827
1828 case SRE_OP_REPEAT:
1829 {
1830 SRE_CODE min, max;
1831 GET_SKIP;
1832 GET_ARG; min = arg;
1833 GET_ARG; max = arg;
1834 if (min > max)
1835 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001836 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001837 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001838 if (!_validate_inner(code, code+skip-3, groups))
1839 FAIL;
1840 code += skip-3;
1841 GET_OP;
1842 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1843 FAIL;
1844 }
1845 break;
1846
1847 case SRE_OP_GROUPREF:
1848 case SRE_OP_GROUPREF_IGNORE:
1849 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001850 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001851 FAIL;
1852 break;
1853
1854 case SRE_OP_GROUPREF_EXISTS:
1855 /* The regex syntax for this is: '(?(group)then|else)', where
1856 'group' is either an integer group number or a group name,
1857 'then' and 'else' are sub-regexes, and 'else' is optional. */
1858 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001859 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001860 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001861 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001862 code--; /* The skip is relative to the first arg! */
1863 /* There are two possibilities here: if there is both a 'then'
1864 part and an 'else' part, the generated code looks like:
1865
1866 GROUPREF_EXISTS
1867 <group>
1868 <skipyes>
1869 ...then part...
1870 JUMP
1871 <skipno>
1872 (<skipyes> jumps here)
1873 ...else part...
1874 (<skipno> jumps here)
1875
1876 If there is only a 'then' part, it looks like:
1877
1878 GROUPREF_EXISTS
1879 <group>
1880 <skip>
1881 ...then part...
1882 (<skip> jumps here)
1883
1884 There is no direct way to decide which it is, and we don't want
1885 to allow arbitrary jumps anywhere in the code; so we just look
1886 for a JUMP opcode preceding our skip target.
1887 */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001888 if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001889 code[skip-3] == SRE_OP_JUMP)
1890 {
1891 VTRACE(("both then and else parts present\n"));
1892 if (!_validate_inner(code+1, code+skip-3, groups))
1893 FAIL;
1894 code += skip-2; /* Position after JUMP, at <skipno> */
1895 GET_SKIP;
1896 if (!_validate_inner(code, code+skip-1, groups))
1897 FAIL;
1898 code += skip-1;
1899 }
1900 else {
1901 VTRACE(("only a then part present\n"));
1902 if (!_validate_inner(code+1, code+skip-1, groups))
1903 FAIL;
1904 code += skip-1;
1905 }
1906 break;
1907
1908 case SRE_OP_ASSERT:
1909 case SRE_OP_ASSERT_NOT:
1910 GET_SKIP;
1911 GET_ARG; /* 0 for lookahead, width for lookbehind */
1912 code--; /* Back up over arg to simplify math below */
1913 if (arg & 0x80000000)
1914 FAIL; /* Width too large */
1915 /* Stop 1 before the end; we check the SUCCESS below */
1916 if (!_validate_inner(code+1, code+skip-2, groups))
1917 FAIL;
1918 code += skip-2;
1919 GET_OP;
1920 if (op != SRE_OP_SUCCESS)
1921 FAIL;
1922 break;
1923
1924 default:
1925 FAIL;
1926
1927 }
1928 }
1929
1930 VTRACE(("okay\n"));
1931 return 1;
1932}
1933
1934static int
1935_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1936{
1937 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
1938 FAIL;
1939 if (groups == 0) /* fix for simplejson */
1940 groups = 100; /* 100 groups should always be safe */
1941 return _validate_inner(code, end-1, groups);
1942}
1943
1944static int
1945_validate(PatternObject *self)
1946{
1947 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1948 {
1949 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1950 return 0;
1951 }
1952 else
1953 VTRACE(("Success!\n"));
1954 return 1;
1955}
1956
1957/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001958/* match methods */
1959
1960static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001961match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001962{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001963 Py_XDECREF(self->regs);
1964 Py_XDECREF(self->string);
1965 Py_DECREF(self->pattern);
1966 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001967}
1968
1969static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001970match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001971{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001972 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001973 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001974 Py_buffer view;
1975 PyObject *result;
1976 void* ptr;
1977
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001978 if (index < 0 || index >= self->groups) {
1979 /* raise IndexError if we were given a bad group number */
1980 PyErr_SetString(
1981 PyExc_IndexError,
1982 "no such group"
1983 );
1984 return NULL;
1985 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001986
Fredrik Lundh6f013982000-07-03 18:44:21 +00001987 index *= 2;
1988
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001989 if (self->string == Py_None || self->mark[index] < 0) {
1990 /* return default value if the string or group is undefined */
1991 Py_INCREF(def);
1992 return def;
1993 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001994
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001995 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001996 if (ptr == NULL)
1997 return NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001998 result = getslice(isbytes, ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001999 self->string, self->mark[index], self->mark[index+1]);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002000 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03002001 PyBuffer_Release(&view);
2002 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002003}
2004
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002005static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002006match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002007{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002008 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002009
Guido van Rossumddefaf32007-01-14 03:31:43 +00002010 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002011 /* Default value */
2012 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00002013
Christian Heimes217cfd12007-12-02 14:31:20 +00002014 if (PyLong_Check(index))
2015 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002016
Fredrik Lundh6f013982000-07-03 18:44:21 +00002017 i = -1;
2018
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002019 if (self->pattern->groupindex) {
2020 index = PyObject_GetItem(self->pattern->groupindex, index);
2021 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00002022 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00002023 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002024 Py_DECREF(index);
2025 } else
2026 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002027 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002028
2029 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002030}
2031
2032static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002033match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002034{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002035 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002036}
2037
2038static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002039match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002040{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002041 /* delegate to Python code */
2042 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002043 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002044 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002045 );
2046}
2047
2048static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002049match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002050{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002052 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002055
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002056 switch (size) {
2057 case 0:
2058 result = match_getslice(self, Py_False, Py_None);
2059 break;
2060 case 1:
2061 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2062 break;
2063 default:
2064 /* fetch multiple items */
2065 result = PyTuple_New(size);
2066 if (!result)
2067 return NULL;
2068 for (i = 0; i < size; i++) {
2069 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002070 self, PyTuple_GET_ITEM(args, i), Py_None
2071 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 if (!item) {
2073 Py_DECREF(result);
2074 return NULL;
2075 }
2076 PyTuple_SET_ITEM(result, i, item);
2077 }
2078 break;
2079 }
2080 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002081}
2082
2083static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002084match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002085{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002087 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002088
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002089 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002090 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002091 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002092 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002093
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002094 result = PyTuple_New(self->groups-1);
2095 if (!result)
2096 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002097
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002098 for (index = 1; index < self->groups; index++) {
2099 PyObject* item;
2100 item = match_getslice_by_index(self, index, def);
2101 if (!item) {
2102 Py_DECREF(result);
2103 return NULL;
2104 }
2105 PyTuple_SET_ITEM(result, index-1, item);
2106 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002107
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002108 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002109}
2110
2111static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002112match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002113{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002114 PyObject* result;
2115 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002116 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002117
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002118 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002119 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002120 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002121 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002122
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002123 result = PyDict_New();
2124 if (!result || !self->pattern->groupindex)
2125 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002126
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002127 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002128 if (!keys)
2129 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002130
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002131 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002132 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002133 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002134 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002135 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002136 if (!key)
2137 goto failed;
2138 value = match_getslice(self, key, def);
2139 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002140 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002141 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002142 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002143 status = PyDict_SetItem(result, key, value);
2144 Py_DECREF(value);
2145 if (status < 0)
2146 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002147 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002148
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002149 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002150
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002151 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002152
2153failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002154 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002155 Py_DECREF(result);
2156 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002157}
2158
2159static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002160match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002161{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002162 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002163
Guido van Rossumddefaf32007-01-14 03:31:43 +00002164 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002165 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002166 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002167
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002168 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002169
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002170 if (index < 0 || index >= self->groups) {
2171 PyErr_SetString(
2172 PyExc_IndexError,
2173 "no such group"
2174 );
2175 return NULL;
2176 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002177
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002178 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002179 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002180}
2181
2182static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002183match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002184{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002185 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002186
Guido van Rossumddefaf32007-01-14 03:31:43 +00002187 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002188 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002189 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002190
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002191 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002192
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002193 if (index < 0 || index >= self->groups) {
2194 PyErr_SetString(
2195 PyExc_IndexError,
2196 "no such group"
2197 );
2198 return NULL;
2199 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002200
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002201 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002202 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002203}
2204
2205LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002206_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002207{
2208 PyObject* pair;
2209 PyObject* item;
2210
2211 pair = PyTuple_New(2);
2212 if (!pair)
2213 return NULL;
2214
Christian Heimes217cfd12007-12-02 14:31:20 +00002215 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002216 if (!item)
2217 goto error;
2218 PyTuple_SET_ITEM(pair, 0, item);
2219
Christian Heimes217cfd12007-12-02 14:31:20 +00002220 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002221 if (!item)
2222 goto error;
2223 PyTuple_SET_ITEM(pair, 1, item);
2224
2225 return pair;
2226
2227 error:
2228 Py_DECREF(pair);
2229 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002230}
2231
2232static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002233match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002234{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002235 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002236
Guido van Rossumddefaf32007-01-14 03:31:43 +00002237 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002238 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002239 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002240
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002241 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002242
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002243 if (index < 0 || index >= self->groups) {
2244 PyErr_SetString(
2245 PyExc_IndexError,
2246 "no such group"
2247 );
2248 return NULL;
2249 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002250
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002251 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002252 return _pair(self->mark[index*2], self->mark[index*2+1]);
2253}
2254
2255static PyObject*
2256match_regs(MatchObject* self)
2257{
2258 PyObject* regs;
2259 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002260 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002261
2262 regs = PyTuple_New(self->groups);
2263 if (!regs)
2264 return NULL;
2265
2266 for (index = 0; index < self->groups; index++) {
2267 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2268 if (!item) {
2269 Py_DECREF(regs);
2270 return NULL;
2271 }
2272 PyTuple_SET_ITEM(regs, index, item);
2273 }
2274
2275 Py_INCREF(regs);
2276 self->regs = regs;
2277
2278 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002279}
2280
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002281static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002282match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002283{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002284#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002285 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002286 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00002287
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002288 slots = 2 * (self->pattern->groups+1);
2289
2290 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2291 if (!copy)
2292 return NULL;
2293
2294 /* this value a constant, but any compiler should be able to
2295 figure that out all by itself */
2296 offset = offsetof(MatchObject, string);
2297
2298 Py_XINCREF(self->pattern);
2299 Py_XINCREF(self->string);
2300 Py_XINCREF(self->regs);
2301
2302 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002303 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002304
2305 return (PyObject*) copy;
2306#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002307 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002308 return NULL;
2309#endif
2310}
2311
2312static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002313match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002314{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002315#ifdef USE_BUILTIN_COPY
2316 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002317
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002318 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002319 if (!copy)
2320 return NULL;
2321
2322 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2323 !deepcopy(&copy->string, memo) ||
2324 !deepcopy(&copy->regs, memo)) {
2325 Py_DECREF(copy);
2326 return NULL;
2327 }
2328
2329#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002330 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2331 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002332#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002333}
2334
/* Docstrings for the match type and its methods.  match_doc is used as
   tp_doc of Match_Type; the others are attached to the corresponding
   entries of match_methods.  The text after each line continuation is
   part of the string, so its leading whitespace is significant. */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002335PyDoc_STRVAR(match_doc,
2336"The result of re.match() and re.search().\n\
2337Match objects always have a boolean value of True.");
2338
2339PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002340"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002341 Return subgroup(s) of the match by indices or names.\n\
2342 For 0 returns the entire match.");
2343
2344PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002345"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002346 Return index of the start of the substring matched by group.");
2347
2348PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002349"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002350 Return index of the end of the substring matched by group.");
2351
2352PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002353"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002354 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
2355
2356PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002357"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002358 Return a tuple containing all the subgroups of the match, from 1.\n\
2359 The default argument is used for groups\n\
2360 that did not participate in the match");
2361
2362PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002363"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002364 Return a dictionary containing all the named subgroups of the match,\n\
2365 keyed by the subgroup name. The default argument is used for groups\n\
2366 that did not participate in the match");
2367
2368PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002369"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002370 Return the string obtained by doing backslash substitution\n\
2371 on the string template, as done by the sub() method.");
2372
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002373static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002374 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2375 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
2376 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
2377 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
2378 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
2379 match_groups_doc},
2380 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
2381 match_groupdict_doc},
2382 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002383 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
2384 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002385 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002386};
2387
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002388static PyObject *
2389match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002390{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002391 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002392 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002393 Py_INCREF(Py_None);
2394 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00002395}
2396
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002397static PyObject *
2398match_lastgroup_get(MatchObject *self)
2399{
2400 if (self->pattern->indexgroup && self->lastindex >= 0) {
2401 PyObject* result = PySequence_GetItem(
2402 self->pattern->indexgroup, self->lastindex
2403 );
2404 if (result)
2405 return result;
2406 PyErr_Clear();
2407 }
2408 Py_INCREF(Py_None);
2409 return Py_None;
2410}
2411
2412static PyObject *
2413match_regs_get(MatchObject *self)
2414{
2415 if (self->regs) {
2416 Py_INCREF(self->regs);
2417 return self->regs;
2418 } else
2419 return match_regs(self);
2420}
2421
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002422static PyObject *
2423match_repr(MatchObject *self)
2424{
2425 PyObject *result;
2426 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2427 if (group0 == NULL)
2428 return NULL;
2429 result = PyUnicode_FromFormat(
2430 "<%s object; span=(%d, %d), match=%.50R>",
2431 Py_TYPE(self)->tp_name,
2432 self->mark[0], self->mark[1], group0);
2433 Py_DECREF(group0);
2434 return result;
2435}
2436
2437
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002438static PyGetSetDef match_getset[] = {
2439 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
2440 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
2441 {"regs", (getter)match_regs_get, (setter)NULL},
2442 {NULL}
2443};
2444
2445#define MATCH_OFF(x) offsetof(MatchObject, x)
2446static PyMemberDef match_members[] = {
2447 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
2448 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
2449 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
2450 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
2451 {NULL}
2452};
2453
Guido van Rossumb700df92000-03-31 14:59:30 +00002454/* FIXME: implement setattr("string", None) as a special case (to
2455 detach the associated string, if any) */
2456
Neal Norwitz57c179c2006-03-22 07:18:02 +00002457static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002458 PyVarObject_HEAD_INIT(NULL,0)
2459 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002460 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002461 (destructor)match_dealloc, /* tp_dealloc */
2462 0, /* tp_print */
2463 0, /* tp_getattr */
2464 0, /* tp_setattr */
2465 0, /* tp_reserved */
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002466 (reprfunc)match_repr, /* tp_repr */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002467 0, /* tp_as_number */
2468 0, /* tp_as_sequence */
2469 0, /* tp_as_mapping */
2470 0, /* tp_hash */
2471 0, /* tp_call */
2472 0, /* tp_str */
2473 0, /* tp_getattro */
2474 0, /* tp_setattro */
2475 0, /* tp_as_buffer */
2476 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002477 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002478 0, /* tp_traverse */
2479 0, /* tp_clear */
2480 0, /* tp_richcompare */
2481 0, /* tp_weaklistoffset */
2482 0, /* tp_iter */
2483 0, /* tp_iternext */
2484 match_methods, /* tp_methods */
2485 match_members, /* tp_members */
2486 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002487};
2488
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002489static PyObject*
Victor Stinnerf5587782013-11-15 23:21:11 +01002490pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002491{
2492 /* create match object (from state object) */
2493
2494 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002495 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002496 char* base;
2497 int n;
2498
2499 if (status > 0) {
2500
2501 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002502 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002503 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
2504 2*(pattern->groups+1));
2505 if (!match)
2506 return NULL;
2507
2508 Py_INCREF(pattern);
2509 match->pattern = pattern;
2510
2511 Py_INCREF(state->string);
2512 match->string = state->string;
2513
2514 match->regs = NULL;
2515 match->groups = pattern->groups+1;
2516
2517 /* fill in group slices */
2518
2519 base = (char*) state->beginning;
2520 n = state->charsize;
2521
2522 match->mark[0] = ((char*) state->start - base) / n;
2523 match->mark[1] = ((char*) state->ptr - base) / n;
2524
2525 for (i = j = 0; i < pattern->groups; i++, j+=2)
2526 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2527 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2528 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2529 } else
2530 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2531
2532 match->pos = state->pos;
2533 match->endpos = state->endpos;
2534
2535 match->lastindex = state->lastindex;
2536
2537 return (PyObject*) match;
2538
2539 } else if (status == 0) {
2540
2541 /* no match */
2542 Py_INCREF(Py_None);
2543 return Py_None;
2544
2545 }
2546
2547 /* internal error */
2548 pattern_error(status);
2549 return NULL;
2550}
2551
2552
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002553/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002554/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002555
/* Deallocator for scanner objects: finalize the embedded matching
   state, drop the reference to the pattern and free the object. */
2556static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002557scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002558{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002559 state_fini(&self->state);
/* XDECREF: pattern_scanner() destroys the scanner while its pattern is
   still NULL when state_init() fails */
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002560 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002561 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002562}
2563
2564static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002565scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002566{
2567 SRE_STATE* state = &self->state;
2568 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002569 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002570
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002571 state_reset(state);
2572
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002573 state->ptr = state->start;
2574
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002575 status = sre_match(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002576 if (PyErr_Occurred())
2577 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002578
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002579 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002580 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002581
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00002582 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002583 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002584 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002585 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002586
2587 return match;
2588}
2589
2590
2591static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002592scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002593{
2594 SRE_STATE* state = &self->state;
2595 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002596 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002597
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002598 state_reset(state);
2599
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002600 state->ptr = state->start;
2601
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002602 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002603 if (PyErr_Occurred())
2604 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002605
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002606 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002607 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002608
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00002609 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002610 state->start = (void*) ((char*) state->ptr + state->charsize);
2611 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002612 state->start = state->ptr;
2613
2614 return match;
2615}
2616
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002617static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002618 {"match", (PyCFunction) scanner_match, METH_NOARGS},
2619 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002620 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002621};
2622
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002623#define SCAN_OFF(x) offsetof(ScannerObject, x)
2624static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03002625 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002626 {NULL} /* Sentinel */
2627};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002628
Neal Norwitz57c179c2006-03-22 07:18:02 +00002629static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002630 PyVarObject_HEAD_INIT(NULL, 0)
2631 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002632 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002633 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002634 0, /* tp_print */
2635 0, /* tp_getattr */
2636 0, /* tp_setattr */
2637 0, /* tp_reserved */
2638 0, /* tp_repr */
2639 0, /* tp_as_number */
2640 0, /* tp_as_sequence */
2641 0, /* tp_as_mapping */
2642 0, /* tp_hash */
2643 0, /* tp_call */
2644 0, /* tp_str */
2645 0, /* tp_getattro */
2646 0, /* tp_setattro */
2647 0, /* tp_as_buffer */
2648 Py_TPFLAGS_DEFAULT, /* tp_flags */
2649 0, /* tp_doc */
2650 0, /* tp_traverse */
2651 0, /* tp_clear */
2652 0, /* tp_richcompare */
2653 0, /* tp_weaklistoffset */
2654 0, /* tp_iter */
2655 0, /* tp_iternext */
2656 scanner_methods, /* tp_methods */
2657 scanner_members, /* tp_members */
2658 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002659};
2660
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002661static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002662pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002663{
2664 /* create search state object */
2665
2666 ScannerObject* self;
2667
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02002668 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002669 Py_ssize_t start = 0;
2670 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02002671 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2672 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:scanner", kwlist,
2673 &string, &start, &end, &string2))
2674 return NULL;
2675
2676 string = fix_string_param(string, string2, "source");
2677 if (!string)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002678 return NULL;
2679
2680 /* create scanner object */
2681 self = PyObject_NEW(ScannerObject, &Scanner_Type);
2682 if (!self)
2683 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002684 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002685
2686 string = state_init(&self->state, pattern, string, start, end);
2687 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002688 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002689 return NULL;
2690 }
2691
2692 Py_INCREF(pattern);
2693 self->pattern = (PyObject*) pattern;
2694
2695 return (PyObject*) self;
2696}
2697
Guido van Rossumb700df92000-03-31 14:59:30 +00002698static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00002699 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002700 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00002701 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002702 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002703};
2704
Martin v. Löwis1a214512008-06-11 05:26:20 +00002705static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002706 PyModuleDef_HEAD_INIT,
2707 "_" SRE_MODULE,
2708 NULL,
2709 -1,
2710 _functions,
2711 NULL,
2712 NULL,
2713 NULL,
2714 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00002715};
2716
2717PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002718{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002719 PyObject* m;
2720 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002721 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002722
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00002723 /* Patch object types */
2724 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
2725 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00002726 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002727
Martin v. Löwis1a214512008-06-11 05:26:20 +00002728 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00002729 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002730 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002731 d = PyModule_GetDict(m);
2732
Christian Heimes217cfd12007-12-02 14:31:20 +00002733 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002734 if (x) {
2735 PyDict_SetItemString(d, "MAGIC", x);
2736 Py_DECREF(x);
2737 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002738
Christian Heimes217cfd12007-12-02 14:31:20 +00002739 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00002740 if (x) {
2741 PyDict_SetItemString(d, "CODESIZE", x);
2742 Py_DECREF(x);
2743 }
2744
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02002745 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
2746 if (x) {
2747 PyDict_SetItemString(d, "MAXREPEAT", x);
2748 Py_DECREF(x);
2749 }
2750
Neal Norwitzfe537132007-08-26 03:55:15 +00002751 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002752 if (x) {
2753 PyDict_SetItemString(d, "copyright", x);
2754 Py_DECREF(x);
2755 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00002756 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00002757}
2758
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002759/* vim:ts=4:sw=4:et
2760*/