blob: 5a8f839c75c0bcc48760b19d91679d9c33f0f591 [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl   created (based on existing template matcher code)
 * 2000-03-06 fl   first alpha, sort of
 * 2000-08-01 fl   fixes for 1.6b1
 * 2000-08-07 fl   use PyOS_CheckStack() if available
 * 2000-09-20 fl   added expand method
 * 2001-03-20 fl   lots of fixes for 2.1b2
 * 2001-04-15 fl   export copyright as Python attribute, not global
 * 2001-04-28 fl   added __copy__ methods (work in progress)
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl   added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl   added sub/subn primitive
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl   fixed empty sub/subn return type
 * 2003-04-18 mvl  fully support 4-byte codes
 * 2003-10-17 gn   implemented non recursive scheme
 * 2013-02-04 mrab added fullmatch primitive
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
37
/* copyright banner for the SRE engine */
static char copyright[] = " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000044#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000045
46#include "sre.h"
47
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030048#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
49
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000050#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000051
/* Name of this module, minus the leading underscore.  A definition
   supplied before this point (e.g. by the build) takes precedence. */
#ifndef SRE_MODULE
#define SRE_MODULE "sre"
#endif

/* NOTE(review): presumably the name of the Python-level module this
   extension backs; its users are not visible in this part of the file. */
#define SRE_PY_MODULE "re"

/* defining this one enables tracing (see TRACE below) */
#undef VERBOSE

/* -------------------------------------------------------------------- */
/* optional features */

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY

/* -------------------------------------------------------------------- */

/* LOCAL(type) introduces a file-local function returning `type`, using
   the cheapest calling convention/inlining hint the compiler offers. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes returned by the engine */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE((fmt, ...)) prints via printf when VERBOSE is defined and
   expands to nothing otherwise; note the double parentheses. */
#ifdef VERBOSE
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
96
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000097/* -------------------------------------------------------------------- */
98/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
/* Default (ASCII-only) character predicates.  Run sre_chars.py to
   regenerate the two tables below. */

/* property bits stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* property bits for code points 0..127, sixteen entries per row */
static char sre_char_info[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2, 2, 2, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 16,
    /* 0x60 */ 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0
};

/* lowercase mapping for code points 0..127: 'A'..'Z' map to 'a'..'z',
   every other entry maps to itself */
static char sre_char_lower[128] = {
    /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    /* 0x40 */ 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92,
               93, 94, 95,
    /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
               109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
               124, 125, 126, 127
};

/* Yields the selected property bits of ch (not normalised to 0/1);
   anything at or above 128 has no properties.  ch is evaluated twice. */
#define SRE_ASCII_INFO(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch) SRE_ASCII_INFO(ch, SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch) SRE_ASCII_INFO(ch, SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_ASCII_INFO(ch, SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch) SRE_ASCII_INFO(ch, SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch) SRE_ASCII_INFO(ch, SRE_WORD_MASK)

/* Lowercase ch through the ASCII table; values of 128 and above are
   returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch < 128)
        return (unsigned int)sre_char_lower[ch];
    return ch;
}
143
/* Locale-specific character predicates. */

/* For any unsigned c, !(c & ~N) == (c < N+1); spelling the range check
 * with a mask avoids warnings when c's type only supports numbers
 * below N+1. */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lowercase ch with the C library's tolower(); values of 256 and above
   are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return (unsigned int)tolower(ch);
    return ch;
}
154
/* Unicode-specific character predicates: thin aliases for the
   Py_UNICODE_* classification macros. */

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
/* a word character is an alphanumeric or the underscore */
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')

/* Lowercase ch via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
167
Guido van Rossumb700df92000-03-31 14:59:30 +0000168LOCAL(int)
169sre_category(SRE_CODE category, unsigned int ch)
170{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000171 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000172
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000173 case SRE_CATEGORY_DIGIT:
174 return SRE_IS_DIGIT(ch);
175 case SRE_CATEGORY_NOT_DIGIT:
176 return !SRE_IS_DIGIT(ch);
177 case SRE_CATEGORY_SPACE:
178 return SRE_IS_SPACE(ch);
179 case SRE_CATEGORY_NOT_SPACE:
180 return !SRE_IS_SPACE(ch);
181 case SRE_CATEGORY_WORD:
182 return SRE_IS_WORD(ch);
183 case SRE_CATEGORY_NOT_WORD:
184 return !SRE_IS_WORD(ch);
185 case SRE_CATEGORY_LINEBREAK:
186 return SRE_IS_LINEBREAK(ch);
187 case SRE_CATEGORY_NOT_LINEBREAK:
188 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000189
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000190 case SRE_CATEGORY_LOC_WORD:
191 return SRE_LOC_IS_WORD(ch);
192 case SRE_CATEGORY_LOC_NOT_WORD:
193 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000194
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000195 case SRE_CATEGORY_UNI_DIGIT:
196 return SRE_UNI_IS_DIGIT(ch);
197 case SRE_CATEGORY_UNI_NOT_DIGIT:
198 return !SRE_UNI_IS_DIGIT(ch);
199 case SRE_CATEGORY_UNI_SPACE:
200 return SRE_UNI_IS_SPACE(ch);
201 case SRE_CATEGORY_UNI_NOT_SPACE:
202 return !SRE_UNI_IS_SPACE(ch);
203 case SRE_CATEGORY_UNI_WORD:
204 return SRE_UNI_IS_WORD(ch);
205 case SRE_CATEGORY_UNI_NOT_WORD:
206 return !SRE_UNI_IS_WORD(ch);
207 case SRE_CATEGORY_UNI_LINEBREAK:
208 return SRE_UNI_IS_LINEBREAK(ch);
209 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
210 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000211 }
212 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000213}
214
215/* helpers */
216
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000217static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000218data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000219{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000220 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000222 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000223 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000224 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000225}
226
227static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000228data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000229{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000230 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000231 minsize = state->data_stack_base+size;
232 cursize = state->data_stack_size;
233 if (cursize < minsize) {
234 void* stack;
235 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300236 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000238 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000239 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000240 return SRE_ERROR_MEMORY;
241 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000242 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000243 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000244 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000245 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000246}
247
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000248/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000249
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300250#define SRE_CHAR Py_UCS1
251#define SIZEOF_SRE_CHAR 1
252#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300253#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000254
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300255/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000256
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300257#define SRE_CHAR Py_UCS2
258#define SIZEOF_SRE_CHAR 2
259#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300260#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000261
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300262/* generate 32-bit unicode version */
263
264#define SRE_CHAR Py_UCS4
265#define SIZEOF_SRE_CHAR 4
266#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300267#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000268
269/* -------------------------------------------------------------------- */
270/* factories and destructors */
271
272/* see sre.h for object declarations */
Victor Stinnerf5587782013-11-15 23:21:11 +0100273static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, Py_ssize_t);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600274static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +0000275
276static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000277sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +0000278{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +0100279 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +0000280}
281
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000282static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +0000283sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000284{
285 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000286 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000287 return NULL;
288 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000289 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000290 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000291 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +0000292 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000293}
294
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000295LOCAL(void)
296state_reset(SRE_STATE* state)
297{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000298 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000299 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000300
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000301 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000302 state->lastindex = -1;
303
304 state->repeat = NULL;
305
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000306 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000307}
308
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000309static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200310getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300311 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600312 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000313{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000314 /* given a python object, return a data pointer, a length (in
315 characters), and a character size. return NULL if the object
316 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000317
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000318 /* Unicode objects do not support the buffer API. So, get the data
319 directly instead. */
320 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 if (PyUnicode_READY(string) == -1)
322 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200323 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200324 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300325 *p_isbytes = 0;
326 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000327 }
328
Victor Stinner0058b862011-09-29 03:27:47 +0200329 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300330 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
331 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
332 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000333 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000334
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300335 *p_length = view->len;
336 *p_charsize = 1;
337 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000338
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300339 if (view->buf == NULL) {
340 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
341 PyBuffer_Release(view);
342 view->buf = NULL;
343 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300345 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000346}
347
348LOCAL(PyObject*)
349state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000350 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000351{
352 /* prepare state object */
353
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000354 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300355 int isbytes, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000356 void* ptr;
357
358 memset(state, 0, sizeof(SRE_STATE));
359
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000360 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000361 state->lastindex = -1;
362
Benjamin Petersone48944b2012-03-07 14:50:25 -0600363 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300364 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000365 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600366 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000367
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300368 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600369 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300370 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600371 goto err;
372 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300373 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600374 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300375 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600376 goto err;
377 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000378
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000379 /* adjust boundaries */
380 if (start < 0)
381 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000382 else if (start > length)
383 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000384
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000385 if (end < 0)
386 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000387 else if (end > length)
388 end = length;
389
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300390 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000391 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +0000392
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000393 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000395 state->start = (void*) ((char*) ptr + start * state->charsize);
396 state->end = (void*) ((char*) ptr + end * state->charsize);
397
398 Py_INCREF(string);
399 state->string = string;
400 state->pos = start;
401 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000402
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000403 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000404 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000405 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000406 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000407 else
Fredrik Lundhb389df32000-06-29 12:48:37 +0000408 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000410 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600411 err:
412 if (state->buffer.buf)
413 PyBuffer_Release(&state->buffer);
414 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000415}
416
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000417LOCAL(void)
418state_fini(SRE_STATE* state)
419{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600420 if (state->buffer.buf)
421 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000422 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000423 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000424}
425
/* Convert a pointer into the subject data (`member`) to a character
   index: its byte distance from state->beginning divided by the
   state's character size. */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
429
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000430LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300431getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300432 PyObject* string, Py_ssize_t start, Py_ssize_t end)
433{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300434 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300435 if (PyBytes_CheckExact(string) &&
436 start == 0 && end == PyBytes_GET_SIZE(string)) {
437 Py_INCREF(string);
438 return string;
439 }
440 return PyBytes_FromStringAndSize(
441 (const char *)ptr + start, end - start);
442 }
443 else {
444 return PyUnicode_Substring(string, start, end);
445 }
446}
447
448LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000449state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000450{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000451 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000452
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000453 index = (index - 1) * 2;
454
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000455 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000456 if (empty)
457 /* want empty string */
458 i = j = 0;
459 else {
460 Py_INCREF(Py_None);
461 return Py_None;
462 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000463 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000464 i = STATE_OFFSET(state, state->mark[index]);
465 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000466 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000467
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300468 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000469}
470
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000471static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100472pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000473{
474 switch (status) {
475 case SRE_ERROR_RECURSION_LIMIT:
476 PyErr_SetString(
477 PyExc_RuntimeError,
478 "maximum recursion limit exceeded"
479 );
480 break;
481 case SRE_ERROR_MEMORY:
482 PyErr_NoMemory();
483 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000484 case SRE_ERROR_INTERRUPTED:
485 /* An exception has already been raised, so let it fly */
486 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000487 default:
488 /* other error codes indicate compiler/engine bugs */
489 PyErr_SetString(
490 PyExc_RuntimeError,
491 "internal error in regular expression engine"
492 );
493 }
494}
495
Guido van Rossumb700df92000-03-31 14:59:30 +0000496static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000497pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000498{
Raymond Hettinger027bb632004-05-31 03:09:25 +0000499 if (self->weakreflist != NULL)
500 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000501 Py_XDECREF(self->pattern);
502 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000503 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000504 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +0000505}
506
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300507LOCAL(Py_ssize_t)
508sre_match(SRE_STATE* state, SRE_CODE* pattern)
509{
510 if (state->charsize == 1)
511 return sre_ucs1_match(state, pattern);
512 if (state->charsize == 2)
513 return sre_ucs2_match(state, pattern);
514 assert(state->charsize == 4);
515 return sre_ucs4_match(state, pattern);
516}
517
518LOCAL(Py_ssize_t)
519sre_search(SRE_STATE* state, SRE_CODE* pattern)
520{
521 if (state->charsize == 1)
522 return sre_ucs1_search(state, pattern);
523 if (state->charsize == 2)
524 return sre_ucs2_search(state, pattern);
525 assert(state->charsize == 4);
526 return sre_ucs4_search(state, pattern);
527}
528
/* SRE_Pattern.match() argument-parsing wrapper.
 *
 * pattern_match() parses (pattern, pos=0, endpos=PY_SSIZE_T_MAX) from
 * args/kwargs with the format "O|nn:match" and forwards the values to
 * pattern_match_impl(); it returns NULL when parsing fails.
 *
 * NOTE(review): everything after the "clinic start generated code"
 * marker, up to the "clinic end generated code" marker that carries a
 * checksum, appears to be Argument Clinic output.  Presumably it should
 * be changed by editing the clinic input below and regenerating rather
 * than by hand -- confirm against the project's tooling. */
/*[clinic input]
module _sre
class _sre.SRE_Pattern

_sre.SRE_Pattern.match as pattern_match

    self: self(type="PatternObject *")
    pattern: object
    pos: Py_ssize_t = 0
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize

Matches zero or more characters at the beginning of the string.
[clinic start generated code]*/

PyDoc_STRVAR(pattern_match__doc__,
"match(pattern, pos=0, endpos=sys.maxsize)\n"
"Matches zero or more characters at the beginning of the string.");

#define PATTERN_MATCH_METHODDEF    \
    {"match", (PyCFunction)pattern_match, METH_VARARGS|METH_KEYWORDS, pattern_match__doc__},

static PyObject *
pattern_match_impl(PatternObject *self, PyObject *pattern, Py_ssize_t pos, Py_ssize_t endpos);

static PyObject *
pattern_match(PyObject *self, PyObject *args, PyObject *kwargs)
{
    PyObject *return_value = NULL;
    static char *_keywords[] = {"pattern", "pos", "endpos", NULL};
    PyObject *pattern;
    Py_ssize_t pos = 0;
    Py_ssize_t endpos = PY_SSIZE_T_MAX;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs,
        "O|nn:match", _keywords,
        &pattern, &pos, &endpos))
        goto exit;
    return_value = pattern_match_impl((PatternObject *)self, pattern, pos, endpos);

exit:
    return return_value;
}
571
572static PyObject *
573pattern_match_impl(PatternObject *self, PyObject *pattern, Py_ssize_t pos, Py_ssize_t endpos)
Larry Hastingsdf7c22b2014-01-07 14:25:26 -0800574/*[clinic end generated code: checksum=63e59c5f3019efe6c1f3acdec42b2d3595e14a09]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000575{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000576 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100577 Py_ssize_t status;
Larry Hastings16c51912014-01-07 11:53:01 -0800578 PyObject *string;
Guido van Rossumb700df92000-03-31 14:59:30 +0000579
Larry Hastings16c51912014-01-07 11:53:01 -0800580 string = state_init(&state, (PatternObject *)self, pattern, pos, endpos);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000581 if (!string)
582 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000583
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000584 state.ptr = state.start;
585
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000586 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
587
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300588 status = sre_match(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000589
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000590 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +0000591 if (PyErr_Occurred())
592 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000593
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000594 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000595
Larry Hastings16c51912014-01-07 11:53:01 -0800596 return (PyObject *)pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +0000597}
598
599static PyObject*
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200600pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
601{
602 SRE_STATE state;
603 Py_ssize_t status;
604
605 PyObject* string;
606 Py_ssize_t start = 0;
607 Py_ssize_t end = PY_SSIZE_T_MAX;
608 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
609 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:fullmatch", kwlist,
610 &string, &start, &end))
611 return NULL;
612
613 string = state_init(&state, self, string, start, end);
614 if (!string)
615 return NULL;
616
617 state.match_all = 1;
618 state.ptr = state.start;
619
620 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
621
622 status = sre_match(&state, PatternObject_GetCode(self));
623
624 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
625 if (PyErr_Occurred())
626 return NULL;
627
628 state_fini(&state);
629
630 return pattern_new_match(self, &state, status);
631}
632
633static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000634pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000635{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000636 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100637 Py_ssize_t status;
Guido van Rossumb700df92000-03-31 14:59:30 +0000638
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000639 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000640 Py_ssize_t start = 0;
641 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +0000642 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000643 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +0000644 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000645 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000646
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000647 string = state_init(&state, self, string, start, end);
648 if (!string)
649 return NULL;
650
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000651 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
652
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300653 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000654
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000655 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
656
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000657 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +0000658
Thomas Wouters89f507f2006-12-13 04:49:30 +0000659 if (PyErr_Occurred())
660 return NULL;
661
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000662 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +0000663}
664
665static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000666call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000667{
668 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000669 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000670 PyObject* func;
671 PyObject* result;
672
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000673 if (!args)
674 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000675 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000676 if (!name)
677 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000678 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000679 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000680 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000681 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000682 func = PyObject_GetAttrString(mod, function);
683 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000684 if (!func)
685 return NULL;
686 result = PyObject_CallObject(func, args);
687 Py_DECREF(func);
688 Py_DECREF(args);
689 return result;
690}
691
#ifdef USE_BUILTIN_COPY
/* Replace *object by copy.deepcopy(*object, memo).
 *
 * On success the old reference held in *object is released, the new
 * object is stored in its place, and 1 is returned.  On failure *object
 * is left untouched and 0 is returned with an exception set.
 *
 * An empty slot (*object == NULL) is reported as success without calling
 * into Python: the fields this is applied to are copied with Py_XINCREF()
 * in pattern_copy(), i.e. they may legitimately be NULL, and a NULL must
 * not be handed to PyTuple_Pack(). */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* copy;

    if (!*object)
        return 1; /* nothing to copy */

    copy = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );
    if (!copy)
        return 0;

    Py_DECREF(*object);
    *object = copy;

    return 1; /* success */
}
#endif
711
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000712static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000713pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000714{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000715 SRE_STATE state;
716 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100717 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000718 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000720 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000721 Py_ssize_t start = 0;
722 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +0000723 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000724 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +0000725 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000726 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000727
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000728 string = state_init(&state, self, string, start, end);
729 if (!string)
730 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000731
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000732 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000733 if (!list) {
734 state_fini(&state);
735 return NULL;
736 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000737
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000738 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000739
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000740 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000741
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000742 state_reset(&state);
743
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000744 state.ptr = state.start;
745
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300746 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300747 if (PyErr_Occurred())
748 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000749
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000750 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000751 if (status == 0)
752 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000753 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000754 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000755 }
Tim Peters3d563502006-01-21 02:47:53 +0000756
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000757 /* don't bother to build a match object */
758 switch (self->groups) {
759 case 0:
760 b = STATE_OFFSET(&state, state.start);
761 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300762 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300763 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000764 if (!item)
765 goto error;
766 break;
767 case 1:
768 item = state_getslice(&state, 1, string, 1);
769 if (!item)
770 goto error;
771 break;
772 default:
773 item = PyTuple_New(self->groups);
774 if (!item)
775 goto error;
776 for (i = 0; i < self->groups; i++) {
777 PyObject* o = state_getslice(&state, i+1, string, 1);
778 if (!o) {
779 Py_DECREF(item);
780 goto error;
781 }
782 PyTuple_SET_ITEM(item, i, o);
783 }
784 break;
785 }
786
787 status = PyList_Append(list, item);
788 Py_DECREF(item);
789 if (status < 0)
790 goto error;
791
792 if (state.ptr == state.start)
793 state.start = (void*) ((char*) state.ptr + state.charsize);
794 else
795 state.start = state.ptr;
796
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000797 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000798
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000799 state_fini(&state);
800 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000801
802error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000803 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 state_fini(&state);
805 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000806
Guido van Rossumb700df92000-03-31 14:59:30 +0000807}
808
Fredrik Lundh703ce812001-10-24 22:16:30 +0000809static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600810pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
Fredrik Lundh703ce812001-10-24 22:16:30 +0000811{
812 PyObject* scanner;
813 PyObject* search;
814 PyObject* iterator;
815
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600816 scanner = pattern_scanner(pattern, args, kw);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000817 if (!scanner)
818 return NULL;
819
820 search = PyObject_GetAttrString(scanner, "search");
821 Py_DECREF(scanner);
822 if (!search)
823 return NULL;
824
825 iterator = PyCallIter_New(search, Py_None);
826 Py_DECREF(search);
827
828 return iterator;
829}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000830
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000831static PyObject*
832pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
833{
834 SRE_STATE state;
835 PyObject* list;
836 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100837 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000838 Py_ssize_t n;
839 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000840 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000841
842 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000843 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +0000844 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000845 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000846 &string, &maxsplit))
847 return NULL;
848
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000849 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000850 if (!string)
851 return NULL;
852
853 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000854 if (!list) {
855 state_fini(&state);
856 return NULL;
857 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000858
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000859 n = 0;
860 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000861
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000862 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000863
864 state_reset(&state);
865
866 state.ptr = state.start;
867
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300868 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300869 if (PyErr_Occurred())
870 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000871
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000872 if (status <= 0) {
873 if (status == 0)
874 break;
875 pattern_error(status);
876 goto error;
877 }
Tim Peters3d563502006-01-21 02:47:53 +0000878
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000879 if (state.start == state.ptr) {
880 if (last == state.end)
881 break;
882 /* skip one character */
883 state.start = (void*) ((char*) state.ptr + state.charsize);
884 continue;
885 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000886
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000887 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300888 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000889 string, STATE_OFFSET(&state, last),
890 STATE_OFFSET(&state, state.start)
891 );
892 if (!item)
893 goto error;
894 status = PyList_Append(list, item);
895 Py_DECREF(item);
896 if (status < 0)
897 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000898
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000899 /* add groups (if any) */
900 for (i = 0; i < self->groups; i++) {
901 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000902 if (!item)
903 goto error;
904 status = PyList_Append(list, item);
905 Py_DECREF(item);
906 if (status < 0)
907 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000908 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000909
910 n = n + 1;
911
912 last = state.start = state.ptr;
913
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000914 }
915
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000916 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300917 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000918 string, STATE_OFFSET(&state, last), state.endpos
919 );
920 if (!item)
921 goto error;
922 status = PyList_Append(list, item);
923 Py_DECREF(item);
924 if (status < 0)
925 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000926
927 state_fini(&state);
928 return list;
929
930error:
931 Py_DECREF(list);
932 state_fini(&state);
933 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000934
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000935}
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000936
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000937static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000938pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000939 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000940{
941 SRE_STATE state;
942 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +0300943 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000944 PyObject* item;
945 PyObject* filter;
946 PyObject* args;
947 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000948 void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100949 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000950 Py_ssize_t n;
951 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300952 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000953 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600954 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000955
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000956 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +0000957 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000958 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +0000959 Py_INCREF(filter);
960 filter_is_callable = 1;
961 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000962 /* if not callable, check if it's a literal string */
963 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600964 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300965 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000967 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300968 if (charsize == 1)
969 literal = memchr(ptr, '\\', n) == NULL;
970 else
971 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000972 } else {
973 PyErr_Clear();
974 literal = 0;
975 }
Benjamin Petersone48944b2012-03-07 14:50:25 -0600976 if (view.buf)
977 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000978 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000979 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000980 Py_INCREF(filter);
981 filter_is_callable = 0;
982 } else {
983 /* not a literal; hand it over to the template compiler */
984 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +0000985 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000986 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000987 );
988 if (!filter)
989 return NULL;
990 filter_is_callable = PyCallable_Check(filter);
991 }
Fredrik Lundhdac58492001-10-21 21:48:30 +0000992 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000993
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000994 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +0000995 if (!string) {
996 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000997 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +0000998 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000999
1000 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001001 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001002 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001003 state_fini(&state);
1004 return NULL;
1005 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001006
1007 n = i = 0;
1008
1009 while (!count || n < count) {
1010
1011 state_reset(&state);
1012
1013 state.ptr = state.start;
1014
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001015 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001016 if (PyErr_Occurred())
1017 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001018
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001019 if (status <= 0) {
1020 if (status == 0)
1021 break;
1022 pattern_error(status);
1023 goto error;
1024 }
Tim Peters3d563502006-01-21 02:47:53 +00001025
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001026 b = STATE_OFFSET(&state, state.start);
1027 e = STATE_OFFSET(&state, state.ptr);
1028
1029 if (i < b) {
1030 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001031 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001032 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001033 if (!item)
1034 goto error;
1035 status = PyList_Append(list, item);
1036 Py_DECREF(item);
1037 if (status < 0)
1038 goto error;
1039
1040 } else if (i == b && i == e && n > 0)
1041 /* ignore empty match on latest position */
1042 goto next;
1043
1044 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001045 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001046 match = pattern_new_match(self, &state, 1);
1047 if (!match)
1048 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001049 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001050 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00001051 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001052 goto error;
1053 }
1054 item = PyObject_CallObject(filter, args);
1055 Py_DECREF(args);
1056 Py_DECREF(match);
1057 if (!item)
1058 goto error;
1059 } else {
1060 /* filter is literal string */
1061 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001062 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001063 }
1064
1065 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001066 if (item != Py_None) {
1067 status = PyList_Append(list, item);
1068 Py_DECREF(item);
1069 if (status < 0)
1070 goto error;
1071 }
Tim Peters3d563502006-01-21 02:47:53 +00001072
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001073 i = e;
1074 n = n + 1;
1075
1076next:
1077 /* move on */
1078 if (state.ptr == state.start)
1079 state.start = (void*) ((char*) state.ptr + state.charsize);
1080 else
1081 state.start = state.ptr;
1082
1083 }
1084
1085 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001086 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001087 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001088 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001089 if (!item)
1090 goto error;
1091 status = PyList_Append(list, item);
1092 Py_DECREF(item);
1093 if (status < 0)
1094 goto error;
1095 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001096
1097 state_fini(&state);
1098
Guido van Rossum4e173842001-12-07 04:25:10 +00001099 Py_DECREF(filter);
1100
Fredrik Lundhdac58492001-10-21 21:48:30 +00001101 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001102 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001103 if (!joiner) {
1104 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001105 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001106 }
1107 if (PyList_GET_SIZE(list) == 0) {
1108 Py_DECREF(list);
1109 item = joiner;
1110 }
1111 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001112 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001113 item = _PyBytes_Join(joiner, list);
1114 else
1115 item = PyUnicode_Join(joiner, list);
1116 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001117 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001118 if (!item)
1119 return NULL;
1120 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001121
1122 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001123 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001124
1125 return item;
1126
1127error:
1128 Py_DECREF(list);
1129 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001130 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001131 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001132
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001133}
1134
1135static PyObject*
1136pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
1137{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001138 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001139 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001140 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001141 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001142 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001143 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001144 return NULL;
1145
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001146 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001147}
1148
1149static PyObject*
1150pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
1151{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001152 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001153 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001154 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001155 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001156 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001157 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001158 return NULL;
1159
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001160 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001161}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001162
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001163static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001164pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001165{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001166#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001167 PatternObject* copy;
1168 int offset;
1169
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001170 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1171 if (!copy)
1172 return NULL;
1173
1174 offset = offsetof(PatternObject, groups);
1175
1176 Py_XINCREF(self->groupindex);
1177 Py_XINCREF(self->indexgroup);
1178 Py_XINCREF(self->pattern);
1179
1180 memcpy((char*) copy + offset, (char*) self + offset,
1181 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00001182 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001183
1184 return (PyObject*) copy;
1185#else
1186 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1187 return NULL;
1188#endif
1189}
1190
1191static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001192pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001193{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001194#ifdef USE_BUILTIN_COPY
1195 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00001196
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001197 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001198 if (!copy)
1199 return NULL;
1200
1201 if (!deepcopy(&copy->groupindex, memo) ||
1202 !deepcopy(&copy->indexgroup, memo) ||
1203 !deepcopy(&copy->pattern, memo)) {
1204 Py_DECREF(copy);
1205 return NULL;
1206 }
1207
1208#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001209 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1210 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001211#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001212}
1213
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001214static PyObject *
1215pattern_repr(PatternObject *obj)
1216{
1217 static const struct {
1218 const char *name;
1219 int value;
1220 } flag_names[] = {
1221 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1222 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1223 {"re.LOCALE", SRE_FLAG_LOCALE},
1224 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1225 {"re.DOTALL", SRE_FLAG_DOTALL},
1226 {"re.UNICODE", SRE_FLAG_UNICODE},
1227 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1228 {"re.DEBUG", SRE_FLAG_DEBUG},
1229 {"re.ASCII", SRE_FLAG_ASCII},
1230 };
1231 PyObject *result = NULL;
1232 PyObject *flag_items;
1233 int i;
1234 int flags = obj->flags;
1235
1236 /* Omit re.UNICODE for valid string patterns. */
1237 if (obj->isbytes == 0 &&
1238 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1239 SRE_FLAG_UNICODE)
1240 flags &= ~SRE_FLAG_UNICODE;
1241
1242 flag_items = PyList_New(0);
1243 if (!flag_items)
1244 return NULL;
1245
1246 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1247 if (flags & flag_names[i].value) {
1248 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1249 if (!item)
1250 goto done;
1251
1252 if (PyList_Append(flag_items, item) < 0) {
1253 Py_DECREF(item);
1254 goto done;
1255 }
1256 Py_DECREF(item);
1257 flags &= ~flag_names[i].value;
1258 }
1259 }
1260 if (flags) {
1261 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1262 if (!item)
1263 goto done;
1264
1265 if (PyList_Append(flag_items, item) < 0) {
1266 Py_DECREF(item);
1267 goto done;
1268 }
1269 Py_DECREF(item);
1270 }
1271
1272 if (PyList_Size(flag_items) > 0) {
1273 PyObject *flags_result;
1274 PyObject *sep = PyUnicode_FromString("|");
1275 if (!sep)
1276 goto done;
1277 flags_result = PyUnicode_Join(sep, flag_items);
1278 Py_DECREF(sep);
1279 if (!flags_result)
1280 goto done;
1281 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1282 obj->pattern, flags_result);
1283 Py_DECREF(flags_result);
1284 }
1285 else {
1286 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1287 }
1288
1289done:
1290 Py_DECREF(flag_items);
1291 return result;
1292}
1293
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001294PyDoc_STRVAR(pattern_fullmatch_doc,
1295"fullmatch(string[, pos[, endpos]]) -> match object or None.\n\
1296 Matches against all of the string");
1297
Raymond Hettinger94478742004-09-24 04:31:19 +00001298PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001299"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001300 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02001301 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001302
1303PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001304"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001305 Split string by the occurrences of pattern.");
1306
1307PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001308"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001309 Return a list of all non-overlapping matches of pattern in string.");
1310
1311PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001312"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001313 Return an iterator over all non-overlapping matches for the \n\
1314 RE pattern in string. For each match, the iterator returns a\n\
1315 match object.");
1316
1317PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001318"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001319 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00001320 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001321
1322PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001323"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001324 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
1325 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00001326 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001327
1328PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
1329
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001330static PyMethodDef pattern_methods[] = {
Larry Hastings16c51912014-01-07 11:53:01 -08001331 PATTERN_MATCH_METHODDEF
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001332 {"fullmatch", (PyCFunction) pattern_fullmatch, METH_VARARGS|METH_KEYWORDS,
1333 pattern_fullmatch_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001334 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001335 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001336 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001337 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001338 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001339 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001340 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001341 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001342 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001343 pattern_findall_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001344 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001345 pattern_finditer_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001346 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001347 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
1348 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001349 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00001350};
1351
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00001352#define PAT_OFF(x) offsetof(PatternObject, x)
1353static PyMemberDef pattern_members[] = {
1354 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
1355 {"flags", T_INT, PAT_OFF(flags), READONLY},
1356 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
1357 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
1358 {NULL} /* Sentinel */
1359};
Guido van Rossumb700df92000-03-31 14:59:30 +00001360
Neal Norwitz57c179c2006-03-22 07:18:02 +00001361static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001362 PyVarObject_HEAD_INIT(NULL, 0)
1363 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00001364 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001365 (destructor)pattern_dealloc, /* tp_dealloc */
1366 0, /* tp_print */
1367 0, /* tp_getattr */
1368 0, /* tp_setattr */
1369 0, /* tp_reserved */
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001370 (reprfunc)pattern_repr, /* tp_repr */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001371 0, /* tp_as_number */
1372 0, /* tp_as_sequence */
1373 0, /* tp_as_mapping */
1374 0, /* tp_hash */
1375 0, /* tp_call */
1376 0, /* tp_str */
1377 0, /* tp_getattro */
1378 0, /* tp_setattro */
1379 0, /* tp_as_buffer */
1380 Py_TPFLAGS_DEFAULT, /* tp_flags */
1381 pattern_doc, /* tp_doc */
1382 0, /* tp_traverse */
1383 0, /* tp_clear */
1384 0, /* tp_richcompare */
1385 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
1386 0, /* tp_iter */
1387 0, /* tp_iternext */
1388 pattern_methods, /* tp_methods */
1389 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00001390};
1391
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001392static int _validate(PatternObject *self); /* Forward */
1393
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001394static PyObject *
1395_compile(PyObject* self_, PyObject* args)
1396{
1397 /* "compile" pattern descriptor to pattern object */
1398
1399 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001400 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001401
1402 PyObject* pattern;
1403 int flags = 0;
1404 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001405 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001406 PyObject* groupindex = NULL;
1407 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001408
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001409 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001410 &PyList_Type, &code, &groups,
1411 &groupindex, &indexgroup))
1412 return NULL;
1413
1414 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001415 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001416 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1417 if (!self)
1418 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001419 self->weakreflist = NULL;
1420 self->pattern = NULL;
1421 self->groupindex = NULL;
1422 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001423
1424 self->codesize = n;
1425
1426 for (i = 0; i < n; i++) {
1427 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001428 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001429 self->code[i] = (SRE_CODE) value;
1430 if ((unsigned long) self->code[i] != value) {
1431 PyErr_SetString(PyExc_OverflowError,
1432 "regular expression code size limit exceeded");
1433 break;
1434 }
1435 }
1436
1437 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001438 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001439 return NULL;
1440 }
1441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001443 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 else {
1446 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001447 int charsize;
1448 Py_buffer view;
1449 view.buf = NULL;
1450 if (!getstring(pattern, &p_length, &self->isbytes,
1451 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 Py_DECREF(self);
1453 return NULL;
1454 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001455 if (view.buf)
1456 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001458
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001459 Py_INCREF(pattern);
1460 self->pattern = pattern;
1461
1462 self->flags = flags;
1463
1464 self->groups = groups;
1465
1466 Py_XINCREF(groupindex);
1467 self->groupindex = groupindex;
1468
1469 Py_XINCREF(indexgroup);
1470 self->indexgroup = indexgroup;
1471
1472 self->weakreflist = NULL;
1473
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001474 if (!_validate(self)) {
1475 Py_DECREF(self);
1476 return NULL;
1477 }
1478
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001479 return (PyObject*) self;
1480}
1481
Guido van Rossumb700df92000-03-31 14:59:30 +00001482/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001483/* Code validation */
1484
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1506
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator; takes a parenthesized printf argument
   list, e.g. VTRACE(("x=%d\n", x)) */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros expect the locals `code` and `end` (current position and
   one-past-the-end of the region being validated) plus `op`, `arg` or
   `skip` to be in scope.  They FAIL when no word is left to read. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and check that skip-(adj) words, counted from the
   skip word itself, do not run past `end`.  `adj` is parenthesized so
   that an expression argument cannot change the meaning of the
   subtraction. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (Py_uintptr_t)(end - code))    \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001547
1548static int
1549_validate_charset(SRE_CODE *code, SRE_CODE *end)
1550{
1551 /* Some variables are manipulated by the macros above */
1552 SRE_CODE op;
1553 SRE_CODE arg;
1554 SRE_CODE offset;
1555 int i;
1556
1557 while (code < end) {
1558 GET_OP;
1559 switch (op) {
1560
1561 case SRE_OP_NEGATE:
1562 break;
1563
1564 case SRE_OP_LITERAL:
1565 GET_ARG;
1566 break;
1567
1568 case SRE_OP_RANGE:
1569 GET_ARG;
1570 GET_ARG;
1571 break;
1572
1573 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001574 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001575 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001576 FAIL;
1577 code += offset;
1578 break;
1579
1580 case SRE_OP_BIGCHARSET:
1581 GET_ARG; /* Number of blocks */
1582 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001583 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001584 FAIL;
1585 /* Make sure that each byte points to a valid block */
1586 for (i = 0; i < 256; i++) {
1587 if (((unsigned char *)code)[i] >= arg)
1588 FAIL;
1589 }
1590 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001591 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001592 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001593 FAIL;
1594 code += offset;
1595 break;
1596
1597 case SRE_OP_CATEGORY:
1598 GET_ARG;
1599 switch (arg) {
1600 case SRE_CATEGORY_DIGIT:
1601 case SRE_CATEGORY_NOT_DIGIT:
1602 case SRE_CATEGORY_SPACE:
1603 case SRE_CATEGORY_NOT_SPACE:
1604 case SRE_CATEGORY_WORD:
1605 case SRE_CATEGORY_NOT_WORD:
1606 case SRE_CATEGORY_LINEBREAK:
1607 case SRE_CATEGORY_NOT_LINEBREAK:
1608 case SRE_CATEGORY_LOC_WORD:
1609 case SRE_CATEGORY_LOC_NOT_WORD:
1610 case SRE_CATEGORY_UNI_DIGIT:
1611 case SRE_CATEGORY_UNI_NOT_DIGIT:
1612 case SRE_CATEGORY_UNI_SPACE:
1613 case SRE_CATEGORY_UNI_NOT_SPACE:
1614 case SRE_CATEGORY_UNI_WORD:
1615 case SRE_CATEGORY_UNI_NOT_WORD:
1616 case SRE_CATEGORY_UNI_LINEBREAK:
1617 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1618 break;
1619 default:
1620 FAIL;
1621 }
1622 break;
1623
1624 default:
1625 FAIL;
1626
1627 }
1628 }
1629
1630 return 1;
1631}
1632
1633static int
1634_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1635{
1636 /* Some variables are manipulated by the macros above */
1637 SRE_CODE op;
1638 SRE_CODE arg;
1639 SRE_CODE skip;
1640
1641 VTRACE(("code=%p, end=%p\n", code, end));
1642
1643 if (code > end)
1644 FAIL;
1645
1646 while (code < end) {
1647 GET_OP;
1648 switch (op) {
1649
1650 case SRE_OP_MARK:
1651 /* We don't check whether marks are properly nested; the
1652 sre_match() code is robust even if they don't, and the worst
1653 you can get is nonsensical match results. */
1654 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001655 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001656 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1657 FAIL;
1658 }
1659 break;
1660
1661 case SRE_OP_LITERAL:
1662 case SRE_OP_NOT_LITERAL:
1663 case SRE_OP_LITERAL_IGNORE:
1664 case SRE_OP_NOT_LITERAL_IGNORE:
1665 GET_ARG;
1666 /* The arg is just a character, nothing to check */
1667 break;
1668
1669 case SRE_OP_SUCCESS:
1670 case SRE_OP_FAILURE:
1671 /* Nothing to check; these normally end the matching process */
1672 break;
1673
1674 case SRE_OP_AT:
1675 GET_ARG;
1676 switch (arg) {
1677 case SRE_AT_BEGINNING:
1678 case SRE_AT_BEGINNING_STRING:
1679 case SRE_AT_BEGINNING_LINE:
1680 case SRE_AT_END:
1681 case SRE_AT_END_LINE:
1682 case SRE_AT_END_STRING:
1683 case SRE_AT_BOUNDARY:
1684 case SRE_AT_NON_BOUNDARY:
1685 case SRE_AT_LOC_BOUNDARY:
1686 case SRE_AT_LOC_NON_BOUNDARY:
1687 case SRE_AT_UNI_BOUNDARY:
1688 case SRE_AT_UNI_NON_BOUNDARY:
1689 break;
1690 default:
1691 FAIL;
1692 }
1693 break;
1694
1695 case SRE_OP_ANY:
1696 case SRE_OP_ANY_ALL:
1697 /* These have no operands */
1698 break;
1699
1700 case SRE_OP_IN:
1701 case SRE_OP_IN_IGNORE:
1702 GET_SKIP;
1703 /* Stop 1 before the end; we check the FAILURE below */
1704 if (!_validate_charset(code, code+skip-2))
1705 FAIL;
1706 if (code[skip-2] != SRE_OP_FAILURE)
1707 FAIL;
1708 code += skip-1;
1709 break;
1710
1711 case SRE_OP_INFO:
1712 {
1713 /* A minimal info field is
1714 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1715 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1716 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001717 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001718 SRE_CODE *newcode;
1719 GET_SKIP;
1720 newcode = code+skip-1;
1721 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001722 GET_ARG;
1723 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001724 /* Check that only valid flags are present */
1725 if ((flags & ~(SRE_INFO_PREFIX |
1726 SRE_INFO_LITERAL |
1727 SRE_INFO_CHARSET)) != 0)
1728 FAIL;
1729 /* PREFIX and CHARSET are mutually exclusive */
1730 if ((flags & SRE_INFO_PREFIX) &&
1731 (flags & SRE_INFO_CHARSET))
1732 FAIL;
1733 /* LITERAL implies PREFIX */
1734 if ((flags & SRE_INFO_LITERAL) &&
1735 !(flags & SRE_INFO_PREFIX))
1736 FAIL;
1737 /* Validate the prefix */
1738 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001739 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001740 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001741 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001742 /* Here comes the prefix string */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001743 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001744 FAIL;
1745 code += prefix_len;
1746 /* And here comes the overlap table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001747 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001748 FAIL;
1749 /* Each overlap value should be < prefix_len */
1750 for (i = 0; i < prefix_len; i++) {
1751 if (code[i] >= prefix_len)
1752 FAIL;
1753 }
1754 code += prefix_len;
1755 }
1756 /* Validate the charset */
1757 if (flags & SRE_INFO_CHARSET) {
1758 if (!_validate_charset(code, newcode-1))
1759 FAIL;
1760 if (newcode[-1] != SRE_OP_FAILURE)
1761 FAIL;
1762 code = newcode;
1763 }
1764 else if (code != newcode) {
1765 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1766 FAIL;
1767 }
1768 }
1769 break;
1770
1771 case SRE_OP_BRANCH:
1772 {
1773 SRE_CODE *target = NULL;
1774 for (;;) {
1775 GET_SKIP;
1776 if (skip == 0)
1777 break;
1778 /* Stop 2 before the end; we check the JUMP below */
1779 if (!_validate_inner(code, code+skip-3, groups))
1780 FAIL;
1781 code += skip-3;
1782 /* Check that it ends with a JUMP, and that each JUMP
1783 has the same target */
1784 GET_OP;
1785 if (op != SRE_OP_JUMP)
1786 FAIL;
1787 GET_SKIP;
1788 if (target == NULL)
1789 target = code+skip-1;
1790 else if (code+skip-1 != target)
1791 FAIL;
1792 }
1793 }
1794 break;
1795
1796 case SRE_OP_REPEAT_ONE:
1797 case SRE_OP_MIN_REPEAT_ONE:
1798 {
1799 SRE_CODE min, max;
1800 GET_SKIP;
1801 GET_ARG; min = arg;
1802 GET_ARG; max = arg;
1803 if (min > max)
1804 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001805 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001806 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001807 if (!_validate_inner(code, code+skip-4, groups))
1808 FAIL;
1809 code += skip-4;
1810 GET_OP;
1811 if (op != SRE_OP_SUCCESS)
1812 FAIL;
1813 }
1814 break;
1815
1816 case SRE_OP_REPEAT:
1817 {
1818 SRE_CODE min, max;
1819 GET_SKIP;
1820 GET_ARG; min = arg;
1821 GET_ARG; max = arg;
1822 if (min > max)
1823 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001824 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001825 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001826 if (!_validate_inner(code, code+skip-3, groups))
1827 FAIL;
1828 code += skip-3;
1829 GET_OP;
1830 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1831 FAIL;
1832 }
1833 break;
1834
1835 case SRE_OP_GROUPREF:
1836 case SRE_OP_GROUPREF_IGNORE:
1837 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001838 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001839 FAIL;
1840 break;
1841
1842 case SRE_OP_GROUPREF_EXISTS:
1843 /* The regex syntax for this is: '(?(group)then|else)', where
1844 'group' is either an integer group number or a group name,
1845 'then' and 'else' are sub-regexes, and 'else' is optional. */
1846 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001847 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001848 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001849 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001850 code--; /* The skip is relative to the first arg! */
1851 /* There are two possibilities here: if there is both a 'then'
1852 part and an 'else' part, the generated code looks like:
1853
1854 GROUPREF_EXISTS
1855 <group>
1856 <skipyes>
1857 ...then part...
1858 JUMP
1859 <skipno>
1860 (<skipyes> jumps here)
1861 ...else part...
1862 (<skipno> jumps here)
1863
1864 If there is only a 'then' part, it looks like:
1865
1866 GROUPREF_EXISTS
1867 <group>
1868 <skip>
1869 ...then part...
1870 (<skip> jumps here)
1871
1872 There is no direct way to decide which it is, and we don't want
1873 to allow arbitrary jumps anywhere in the code; so we just look
1874 for a JUMP opcode preceding our skip target.
1875 */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001876 if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001877 code[skip-3] == SRE_OP_JUMP)
1878 {
1879 VTRACE(("both then and else parts present\n"));
1880 if (!_validate_inner(code+1, code+skip-3, groups))
1881 FAIL;
1882 code += skip-2; /* Position after JUMP, at <skipno> */
1883 GET_SKIP;
1884 if (!_validate_inner(code, code+skip-1, groups))
1885 FAIL;
1886 code += skip-1;
1887 }
1888 else {
1889 VTRACE(("only a then part present\n"));
1890 if (!_validate_inner(code+1, code+skip-1, groups))
1891 FAIL;
1892 code += skip-1;
1893 }
1894 break;
1895
1896 case SRE_OP_ASSERT:
1897 case SRE_OP_ASSERT_NOT:
1898 GET_SKIP;
1899 GET_ARG; /* 0 for lookahead, width for lookbehind */
1900 code--; /* Back up over arg to simplify math below */
1901 if (arg & 0x80000000)
1902 FAIL; /* Width too large */
1903 /* Stop 1 before the end; we check the SUCCESS below */
1904 if (!_validate_inner(code+1, code+skip-2, groups))
1905 FAIL;
1906 code += skip-2;
1907 GET_OP;
1908 if (op != SRE_OP_SUCCESS)
1909 FAIL;
1910 break;
1911
1912 default:
1913 FAIL;
1914
1915 }
1916 }
1917
1918 VTRACE(("okay\n"));
1919 return 1;
1920}
1921
1922static int
1923_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1924{
1925 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
1926 FAIL;
1927 if (groups == 0) /* fix for simplejson */
1928 groups = 100; /* 100 groups should always be safe */
1929 return _validate_inner(code, end-1, groups);
1930}
1931
1932static int
1933_validate(PatternObject *self)
1934{
1935 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1936 {
1937 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1938 return 0;
1939 }
1940 else
1941 VTRACE(("Success!\n"));
1942 return 1;
1943}
1944
1945/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001946/* match methods */
1947
1948static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001949match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001950{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001951 Py_XDECREF(self->regs);
1952 Py_XDECREF(self->string);
1953 Py_DECREF(self->pattern);
1954 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001955}
1956
1957static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001958match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001959{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001960 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001961 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001962 Py_buffer view;
1963 PyObject *result;
1964 void* ptr;
1965
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001966 if (index < 0 || index >= self->groups) {
1967 /* raise IndexError if we were given a bad group number */
1968 PyErr_SetString(
1969 PyExc_IndexError,
1970 "no such group"
1971 );
1972 return NULL;
1973 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001974
Fredrik Lundh6f013982000-07-03 18:44:21 +00001975 index *= 2;
1976
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001977 if (self->string == Py_None || self->mark[index] < 0) {
1978 /* return default value if the string or group is undefined */
1979 Py_INCREF(def);
1980 return def;
1981 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001982
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001983 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001984 if (ptr == NULL)
1985 return NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001986 result = getslice(isbytes, ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001987 self->string, self->mark[index], self->mark[index+1]);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001988 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001989 PyBuffer_Release(&view);
1990 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001991}
1992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001993static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001994match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001995{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001996 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001997
Guido van Rossumddefaf32007-01-14 03:31:43 +00001998 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001999 /* Default value */
2000 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00002001
Christian Heimes217cfd12007-12-02 14:31:20 +00002002 if (PyLong_Check(index))
2003 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002004
Fredrik Lundh6f013982000-07-03 18:44:21 +00002005 i = -1;
2006
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002007 if (self->pattern->groupindex) {
2008 index = PyObject_GetItem(self->pattern->groupindex, index);
2009 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00002010 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00002011 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002012 Py_DECREF(index);
2013 } else
2014 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002015 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002016
2017 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002018}
2019
2020static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002021match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002022{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002023 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002024}
2025
2026static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002027match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002028{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002029 /* delegate to Python code */
2030 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002031 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002032 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002033 );
2034}
2035
2036static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002037match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002038{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002039 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002040 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002041
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002043
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 switch (size) {
2045 case 0:
2046 result = match_getslice(self, Py_False, Py_None);
2047 break;
2048 case 1:
2049 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2050 break;
2051 default:
2052 /* fetch multiple items */
2053 result = PyTuple_New(size);
2054 if (!result)
2055 return NULL;
2056 for (i = 0; i < size; i++) {
2057 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002058 self, PyTuple_GET_ITEM(args, i), Py_None
2059 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 if (!item) {
2061 Py_DECREF(result);
2062 return NULL;
2063 }
2064 PyTuple_SET_ITEM(result, i, item);
2065 }
2066 break;
2067 }
2068 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002069}
2070
2071static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002072match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002073{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002074 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002075 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002076
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002077 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002078 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002079 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002080 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002081
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 result = PyTuple_New(self->groups-1);
2083 if (!result)
2084 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002085
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 for (index = 1; index < self->groups; index++) {
2087 PyObject* item;
2088 item = match_getslice_by_index(self, index, def);
2089 if (!item) {
2090 Py_DECREF(result);
2091 return NULL;
2092 }
2093 PyTuple_SET_ITEM(result, index-1, item);
2094 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002095
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002096 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002097}
2098
2099static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002100match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002101{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002102 PyObject* result;
2103 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002104 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002105
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002106 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002107 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002108 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002109 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002110
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002111 result = PyDict_New();
2112 if (!result || !self->pattern->groupindex)
2113 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002114
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002115 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002116 if (!keys)
2117 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002118
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002119 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002120 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002121 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002122 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002123 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002124 if (!key)
2125 goto failed;
2126 value = match_getslice(self, key, def);
2127 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002129 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002130 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002131 status = PyDict_SetItem(result, key, value);
2132 Py_DECREF(value);
2133 if (status < 0)
2134 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002135 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002136
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002137 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002138
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002139 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002140
2141failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002142 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002143 Py_DECREF(result);
2144 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002145}
2146
2147static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002148match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002149{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002150 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002151
Guido van Rossumddefaf32007-01-14 03:31:43 +00002152 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002153 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002154 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002155
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002156 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002158 if (index < 0 || index >= self->groups) {
2159 PyErr_SetString(
2160 PyExc_IndexError,
2161 "no such group"
2162 );
2163 return NULL;
2164 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002165
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002166 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002167 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002168}
2169
2170static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002171match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002172{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002173 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002174
Guido van Rossumddefaf32007-01-14 03:31:43 +00002175 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002176 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002177 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002178
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002179 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002180
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002181 if (index < 0 || index >= self->groups) {
2182 PyErr_SetString(
2183 PyExc_IndexError,
2184 "no such group"
2185 );
2186 return NULL;
2187 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002188
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002189 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002190 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002191}
2192
2193LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002194_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002195{
2196 PyObject* pair;
2197 PyObject* item;
2198
2199 pair = PyTuple_New(2);
2200 if (!pair)
2201 return NULL;
2202
Christian Heimes217cfd12007-12-02 14:31:20 +00002203 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002204 if (!item)
2205 goto error;
2206 PyTuple_SET_ITEM(pair, 0, item);
2207
Christian Heimes217cfd12007-12-02 14:31:20 +00002208 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002209 if (!item)
2210 goto error;
2211 PyTuple_SET_ITEM(pair, 1, item);
2212
2213 return pair;
2214
2215 error:
2216 Py_DECREF(pair);
2217 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002218}
2219
2220static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002221match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002222{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002223 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002224
Guido van Rossumddefaf32007-01-14 03:31:43 +00002225 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002226 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002227 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002228
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002229 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002230
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002231 if (index < 0 || index >= self->groups) {
2232 PyErr_SetString(
2233 PyExc_IndexError,
2234 "no such group"
2235 );
2236 return NULL;
2237 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002238
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002239 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002240 return _pair(self->mark[index*2], self->mark[index*2+1]);
2241}
2242
2243static PyObject*
2244match_regs(MatchObject* self)
2245{
2246 PyObject* regs;
2247 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002248 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002249
2250 regs = PyTuple_New(self->groups);
2251 if (!regs)
2252 return NULL;
2253
2254 for (index = 0; index < self->groups; index++) {
2255 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2256 if (!item) {
2257 Py_DECREF(regs);
2258 return NULL;
2259 }
2260 PyTuple_SET_ITEM(regs, index, item);
2261 }
2262
2263 Py_INCREF(regs);
2264 self->regs = regs;
2265
2266 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002267}
2268
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002269static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002270match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002271{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002272#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002273 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002274 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00002275
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002276 slots = 2 * (self->pattern->groups+1);
2277
2278 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2279 if (!copy)
2280 return NULL;
2281
2282 /* this value a constant, but any compiler should be able to
2283 figure that out all by itself */
2284 offset = offsetof(MatchObject, string);
2285
2286 Py_XINCREF(self->pattern);
2287 Py_XINCREF(self->string);
2288 Py_XINCREF(self->regs);
2289
2290 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002291 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002292
2293 return (PyObject*) copy;
2294#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002295 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002296 return NULL;
2297#endif
2298}
2299
2300static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002301match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002302{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002303#ifdef USE_BUILTIN_COPY
2304 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002305
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002306 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002307 if (!copy)
2308 return NULL;
2309
2310 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2311 !deepcopy(&copy->string, memo) ||
2312 !deepcopy(&copy->regs, memo)) {
2313 Py_DECREF(copy);
2314 return NULL;
2315 }
2316
2317#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002318 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2319 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002320#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002321}
2322
/* Docstrings for the match type (match_doc, used as tp_doc of Match_Type)
   and for the methods listed in match_methods. */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002323PyDoc_STRVAR(match_doc,
2324"The result of re.match() and re.search().\n\
2325Match objects always have a boolean value of True.");
2326
2327PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002328"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002329 Return subgroup(s) of the match by indices or names.\n\
2330 For 0 returns the entire match.");
2331
2332PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002333"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002334 Return index of the start of the substring matched by group.");
2335
2336PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002337"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002338 Return index of the end of the substring matched by group.");
2339
2340PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002341"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002342 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
2343
2344PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002345"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002346 Return a tuple containing all the subgroups of the match, from 1.\n\
2347 The default argument is used for groups\n\
2348 that did not participate in the match");
2349
2350PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002351"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002352 Return a dictionary containing all the named subgroups of the match,\n\
2353 keyed by the subgroup name. The default argument is used for groups\n\
2354 that did not participate in the match");
2355
2356PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002357"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002358 Return the string obtained by doing backslash substitution\n\
2359 on the string template, as done by the sub() method.");
2360
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002361static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002362 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2363 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
2364 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
2365 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
2366 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
2367 match_groups_doc},
2368 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
2369 match_groupdict_doc},
2370 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002371 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
2372 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002373 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002374};
2375
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002376static PyObject *
2377match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002378{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002379 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002380 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002381 Py_INCREF(Py_None);
2382 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00002383}
2384
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002385static PyObject *
2386match_lastgroup_get(MatchObject *self)
2387{
2388 if (self->pattern->indexgroup && self->lastindex >= 0) {
2389 PyObject* result = PySequence_GetItem(
2390 self->pattern->indexgroup, self->lastindex
2391 );
2392 if (result)
2393 return result;
2394 PyErr_Clear();
2395 }
2396 Py_INCREF(Py_None);
2397 return Py_None;
2398}
2399
2400static PyObject *
2401match_regs_get(MatchObject *self)
2402{
2403 if (self->regs) {
2404 Py_INCREF(self->regs);
2405 return self->regs;
2406 } else
2407 return match_regs(self);
2408}
2409
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002410static PyObject *
2411match_repr(MatchObject *self)
2412{
2413 PyObject *result;
2414 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2415 if (group0 == NULL)
2416 return NULL;
2417 result = PyUnicode_FromFormat(
2418 "<%s object; span=(%d, %d), match=%.50R>",
2419 Py_TYPE(self)->tp_name,
2420 self->mark[0], self->mark[1], group0);
2421 Py_DECREF(group0);
2422 return result;
2423}
2424
2425
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002426static PyGetSetDef match_getset[] = {
2427 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
2428 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
2429 {"regs", (getter)match_regs_get, (setter)NULL},
2430 {NULL}
2431};
2432
2433#define MATCH_OFF(x) offsetof(MatchObject, x)
2434static PyMemberDef match_members[] = {
2435 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
2436 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
2437 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
2438 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
2439 {NULL}
2440};
2441
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2444
Neal Norwitz57c179c2006-03-22 07:18:02 +00002445static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002446 PyVarObject_HEAD_INIT(NULL,0)
2447 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002448 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002449 (destructor)match_dealloc, /* tp_dealloc */
2450 0, /* tp_print */
2451 0, /* tp_getattr */
2452 0, /* tp_setattr */
2453 0, /* tp_reserved */
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002454 (reprfunc)match_repr, /* tp_repr */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002455 0, /* tp_as_number */
2456 0, /* tp_as_sequence */
2457 0, /* tp_as_mapping */
2458 0, /* tp_hash */
2459 0, /* tp_call */
2460 0, /* tp_str */
2461 0, /* tp_getattro */
2462 0, /* tp_setattro */
2463 0, /* tp_as_buffer */
2464 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002465 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002466 0, /* tp_traverse */
2467 0, /* tp_clear */
2468 0, /* tp_richcompare */
2469 0, /* tp_weaklistoffset */
2470 0, /* tp_iter */
2471 0, /* tp_iternext */
2472 match_methods, /* tp_methods */
2473 match_members, /* tp_members */
2474 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002475};
2476
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002477static PyObject*
Victor Stinnerf5587782013-11-15 23:21:11 +01002478pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002479{
2480 /* create match object (from state object) */
2481
2482 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002483 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002484 char* base;
2485 int n;
2486
2487 if (status > 0) {
2488
2489 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002490 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002491 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
2492 2*(pattern->groups+1));
2493 if (!match)
2494 return NULL;
2495
2496 Py_INCREF(pattern);
2497 match->pattern = pattern;
2498
2499 Py_INCREF(state->string);
2500 match->string = state->string;
2501
2502 match->regs = NULL;
2503 match->groups = pattern->groups+1;
2504
2505 /* fill in group slices */
2506
2507 base = (char*) state->beginning;
2508 n = state->charsize;
2509
2510 match->mark[0] = ((char*) state->start - base) / n;
2511 match->mark[1] = ((char*) state->ptr - base) / n;
2512
2513 for (i = j = 0; i < pattern->groups; i++, j+=2)
2514 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2515 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2516 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2517 } else
2518 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2519
2520 match->pos = state->pos;
2521 match->endpos = state->endpos;
2522
2523 match->lastindex = state->lastindex;
2524
2525 return (PyObject*) match;
2526
2527 } else if (status == 0) {
2528
2529 /* no match */
2530 Py_INCREF(Py_None);
2531 return Py_None;
2532
2533 }
2534
2535 /* internal error */
2536 pattern_error(status);
2537 return NULL;
2538}
2539
2540
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002541/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002542/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002543
2544static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002545scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002546{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002547 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002548 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002549 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002550}
2551
2552static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002553scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002554{
2555 SRE_STATE* state = &self->state;
2556 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002557 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002558
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002559 state_reset(state);
2560
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002561 state->ptr = state->start;
2562
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002563 status = sre_match(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002564 if (PyErr_Occurred())
2565 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002566
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002567 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002568 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002569
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00002570 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002571 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002572 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002573 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002574
2575 return match;
2576}
2577
2578
2579static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002580scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002581{
2582 SRE_STATE* state = &self->state;
2583 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002584 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002585
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002586 state_reset(state);
2587
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002588 state->ptr = state->start;
2589
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002590 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002591 if (PyErr_Occurred())
2592 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002593
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002594 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002595 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002596
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00002597 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002598 state->start = (void*) ((char*) state->ptr + state->charsize);
2599 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002600 state->start = state->ptr;
2601
2602 return match;
2603}
2604
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002605static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002606 {"match", (PyCFunction) scanner_match, METH_NOARGS},
2607 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002609};
2610
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002611#define SCAN_OFF(x) offsetof(ScannerObject, x)
2612static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03002613 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002614 {NULL} /* Sentinel */
2615};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002616
Neal Norwitz57c179c2006-03-22 07:18:02 +00002617static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002618 PyVarObject_HEAD_INIT(NULL, 0)
2619 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002620 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002621 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002622 0, /* tp_print */
2623 0, /* tp_getattr */
2624 0, /* tp_setattr */
2625 0, /* tp_reserved */
2626 0, /* tp_repr */
2627 0, /* tp_as_number */
2628 0, /* tp_as_sequence */
2629 0, /* tp_as_mapping */
2630 0, /* tp_hash */
2631 0, /* tp_call */
2632 0, /* tp_str */
2633 0, /* tp_getattro */
2634 0, /* tp_setattro */
2635 0, /* tp_as_buffer */
2636 Py_TPFLAGS_DEFAULT, /* tp_flags */
2637 0, /* tp_doc */
2638 0, /* tp_traverse */
2639 0, /* tp_clear */
2640 0, /* tp_richcompare */
2641 0, /* tp_weaklistoffset */
2642 0, /* tp_iter */
2643 0, /* tp_iternext */
2644 scanner_methods, /* tp_methods */
2645 scanner_members, /* tp_members */
2646 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002647};
2648
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002649static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002650pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002651{
2652 /* create search state object */
2653
2654 ScannerObject* self;
2655
2656 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002657 Py_ssize_t start = 0;
2658 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002659 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2660 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
2661 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002662 return NULL;
2663
2664 /* create scanner object */
2665 self = PyObject_NEW(ScannerObject, &Scanner_Type);
2666 if (!self)
2667 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002668 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002669
2670 string = state_init(&self->state, pattern, string, start, end);
2671 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002672 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002673 return NULL;
2674 }
2675
2676 Py_INCREF(pattern);
2677 self->pattern = (PyObject*) pattern;
2678
2679 return (PyObject*) self;
2680}
2681
Guido van Rossumb700df92000-03-31 14:59:30 +00002682static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00002683 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002684 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00002685 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002686 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002687};
2688
Martin v. Löwis1a214512008-06-11 05:26:20 +00002689static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002690 PyModuleDef_HEAD_INIT,
2691 "_" SRE_MODULE,
2692 NULL,
2693 -1,
2694 _functions,
2695 NULL,
2696 NULL,
2697 NULL,
2698 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00002699};
2700
2701PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002702{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002703 PyObject* m;
2704 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002705 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002706
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00002707 /* Patch object types */
2708 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
2709 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00002710 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002711
Martin v. Löwis1a214512008-06-11 05:26:20 +00002712 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00002713 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002714 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002715 d = PyModule_GetDict(m);
2716
Christian Heimes217cfd12007-12-02 14:31:20 +00002717 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002718 if (x) {
2719 PyDict_SetItemString(d, "MAGIC", x);
2720 Py_DECREF(x);
2721 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002722
Christian Heimes217cfd12007-12-02 14:31:20 +00002723 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00002724 if (x) {
2725 PyDict_SetItemString(d, "CODESIZE", x);
2726 Py_DECREF(x);
2727 }
2728
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02002729 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
2730 if (x) {
2731 PyDict_SetItemString(d, "MAXREPEAT", x);
2732 Py_DECREF(x);
2733 }
2734
Neal Norwitzfe537132007-08-26 03:55:15 +00002735 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002736 if (x) {
2737 PyDict_SetItemString(d, "copyright", x);
2738 Py_DECREF(x);
2739 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00002740 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00002741}
2742
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002743/* vim:ts=4:sw=4:et
2744*/