blob: 0dc5212e45fb80ee1ac699297ac24e8a6fa9d9cc [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-09-20 fl added expand method
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2 compatibility
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl added sub/subn primitive
 * 2001-10-24 fl added finditer primitive (for 2.2 only)
 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn implemented non recursive scheme
 * 2013-02-04 mrab added fullmatch primitive
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
37
/* Engine identification / copyright banner.
   NOTE(review): the history comment says this is exported as a Python
   attribute; the code doing so is outside this view -- confirm. */
static char copyright[] = " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000044#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000045
46#include "sre.h"
47
/* width of one SRE_CODE unit, counted as 8 bits per byte of sizeof */
#define SRE_CODE_BITS (sizeof(SRE_CODE) * 8)
49
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000050#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000051
/* Name of this module without its leading underscore.  A definition
   supplied before this point (e.g. by the build) takes precedence. */
#ifndef SRE_MODULE
#define SRE_MODULE "sre"
#endif

/* name of the Python-level module */
#define SRE_PY_MODULE "re"

/* tracing switch: TRACE() output is only compiled in when VERBOSE is
   defined, and it is explicitly turned off here */
#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000061
/* -------------------------------------------------------------------- */
/* optional features */

/* fast searching: switched on */
#define USE_FAST_SEARCH

/* copy/deepcopy handling (work in progress): switched off */
#undef USE_BUILTIN_COPY

/* -------------------------------------------------------------------- */

/* LOCAL(type) introduces a file-local helper returning `type`; the exact
   storage/inline/calling-convention keywords depend on the compiler. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
83
/* negative status codes reported by the matching engine */
#define SRE_ERROR_ILLEGAL          -1   /* illegal opcode */
#define SRE_ERROR_STATE            -2   /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT  -3   /* runaway recursion */
#define SRE_ERROR_MEMORY           -9   /* out of memory */
#define SRE_ERROR_INTERRUPTED      -10  /* signal handler raised exception */

/* TRACE((fmt, ...)) takes a parenthesised printf argument list; it
   prints when VERBOSE is defined and expands to nothing otherwise */
#ifdef VERBOSE
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
96
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000097/* -------------------------------------------------------------------- */
98/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000099
/* ASCII-only character predicates: anything at or above 128 never
   matches; below that the Py_IS* classifiers decide.  A line break is
   exactly '\n'; a word character is an alphanumeric or '_'. */
#define SRE_IS_DIGIT(ch) ((ch) < 128 && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch) ((ch) < 128 && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_IS_ALNUM(ch) ((ch) < 128 && Py_ISALNUM(ch))
#define SRE_IS_WORD(ch) ((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_'))
Guido van Rossumb700df92000-03-31 14:59:30 +0000110
/* ASCII lowercasing: values below 128 are mapped through Py_TOLOWER,
   every other value is returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    if (ch < 128)
        return Py_TOLOWER(ch);
    return ch;
}
115
/* locale-specific character predicates */
/* The range test is spelled as a mask check: ((c & ~N) == 0) is the same
 * as (c < N+1) for any unsigned c, and it avoids warnings when c's type
 * can only hold values < N+1.  Values outside 0..255 never match. */
#define SRE_LOC_IS_ALNUM(ch) ((((ch) & ~255) == 0) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
121
/* Locale-aware lowercasing: values below 256 are passed to the C
   library's tolower(); larger values are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower(ch);
}
126
/* unicode-specific character predicates: thin aliases for the
   Py_UNICODE_* classifiers; a word character is an alphanumeric or '_' */

#define SRE_UNI_IS_DIGIT(ch)      Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch)      Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch)  Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch)      Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch)       (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000134
/* Unicode lowercasing: returns Py_UNICODE_TOLOWER(ch) converted to
   unsigned int. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
139
Guido van Rossumb700df92000-03-31 14:59:30 +0000140LOCAL(int)
141sre_category(SRE_CODE category, unsigned int ch)
142{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000143 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000144
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000145 case SRE_CATEGORY_DIGIT:
146 return SRE_IS_DIGIT(ch);
147 case SRE_CATEGORY_NOT_DIGIT:
148 return !SRE_IS_DIGIT(ch);
149 case SRE_CATEGORY_SPACE:
150 return SRE_IS_SPACE(ch);
151 case SRE_CATEGORY_NOT_SPACE:
152 return !SRE_IS_SPACE(ch);
153 case SRE_CATEGORY_WORD:
154 return SRE_IS_WORD(ch);
155 case SRE_CATEGORY_NOT_WORD:
156 return !SRE_IS_WORD(ch);
157 case SRE_CATEGORY_LINEBREAK:
158 return SRE_IS_LINEBREAK(ch);
159 case SRE_CATEGORY_NOT_LINEBREAK:
160 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000161
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000162 case SRE_CATEGORY_LOC_WORD:
163 return SRE_LOC_IS_WORD(ch);
164 case SRE_CATEGORY_LOC_NOT_WORD:
165 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000166
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000167 case SRE_CATEGORY_UNI_DIGIT:
168 return SRE_UNI_IS_DIGIT(ch);
169 case SRE_CATEGORY_UNI_NOT_DIGIT:
170 return !SRE_UNI_IS_DIGIT(ch);
171 case SRE_CATEGORY_UNI_SPACE:
172 return SRE_UNI_IS_SPACE(ch);
173 case SRE_CATEGORY_UNI_NOT_SPACE:
174 return !SRE_UNI_IS_SPACE(ch);
175 case SRE_CATEGORY_UNI_WORD:
176 return SRE_UNI_IS_WORD(ch);
177 case SRE_CATEGORY_UNI_NOT_WORD:
178 return !SRE_UNI_IS_WORD(ch);
179 case SRE_CATEGORY_UNI_LINEBREAK:
180 return SRE_UNI_IS_LINEBREAK(ch);
181 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
182 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000183 }
184 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000185}
186
187/* helpers */
188
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000189static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000190data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000191{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000192 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000194 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000195 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000196 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000197}
198
199static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000200data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000201{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000202 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000203 minsize = state->data_stack_base+size;
204 cursize = state->data_stack_size;
205 if (cursize < minsize) {
206 void* stack;
207 cursize = minsize+minsize/4+1024;
Serhiy Storchaka134f0de2013-09-05 18:01:15 +0300208 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000209 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000210 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000211 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000212 return SRE_ERROR_MEMORY;
213 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000214 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000215 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000216 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000217 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000218}
219
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000220/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000221
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300222#define SRE_CHAR Py_UCS1
223#define SIZEOF_SRE_CHAR 1
224#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300225#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000226
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300227/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000228
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300229#define SRE_CHAR Py_UCS2
230#define SIZEOF_SRE_CHAR 2
231#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300232#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000233
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300234/* generate 32-bit unicode version */
235
236#define SRE_CHAR Py_UCS4
237#define SIZEOF_SRE_CHAR 4
238#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300239#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000240
241/* -------------------------------------------------------------------- */
242/* factories and destructors */
243
244/* see sre.h for object declarations */
Victor Stinnerf5587782013-11-15 23:21:11 +0100245static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, Py_ssize_t);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600246static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +0000247
248static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +0000249sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +0000250{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +0100251 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +0000252}
253
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000254static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +0000255sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000256{
257 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000258 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000259 return NULL;
260 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000261 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000262 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000263 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +0000264 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000265}
266
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000267LOCAL(void)
268state_reset(SRE_STATE* state)
269{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000270 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000271 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000272
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000273 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000274 state->lastindex = -1;
275
276 state->repeat = NULL;
277
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000278 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000279}
280
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000281static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200282getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300283 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600284 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000285{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000286 /* given a python object, return a data pointer, a length (in
287 characters), and a character size. return NULL if the object
288 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000289
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000290 /* Unicode objects do not support the buffer API. So, get the data
291 directly instead. */
292 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200293 if (PyUnicode_READY(string) == -1)
294 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200295 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200296 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300297 *p_isbytes = 0;
298 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000299 }
300
Victor Stinner0058b862011-09-29 03:27:47 +0200301 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300302 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
303 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
304 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000305 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000306
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300307 *p_length = view->len;
308 *p_charsize = 1;
309 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000310
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300311 if (view->buf == NULL) {
312 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
313 PyBuffer_Release(view);
314 view->buf = NULL;
315 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000316 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300317 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000318}
319
320LOCAL(PyObject*)
321state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000322 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000323{
324 /* prepare state object */
325
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000326 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300327 int isbytes, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000328 void* ptr;
329
330 memset(state, 0, sizeof(SRE_STATE));
331
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300332 state->mark = PyMem_New(void *, pattern->groups * 2);
333 if (!state->mark) {
334 PyErr_NoMemory();
335 goto err;
336 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000337 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000338 state->lastindex = -1;
339
Benjamin Petersone48944b2012-03-07 14:50:25 -0600340 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300341 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000342 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600343 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000344
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300345 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600346 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300347 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600348 goto err;
349 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300350 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600351 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300352 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600353 goto err;
354 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000355
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000356 /* adjust boundaries */
357 if (start < 0)
358 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000359 else if (start > length)
360 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000361
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000362 if (end < 0)
363 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000364 else if (end > length)
365 end = length;
366
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300367 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000368 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +0000369
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000370 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000371
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000372 state->start = (void*) ((char*) ptr + start * state->charsize);
373 state->end = (void*) ((char*) ptr + end * state->charsize);
374
375 Py_INCREF(string);
376 state->string = string;
377 state->pos = start;
378 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000379
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000380 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000381 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000382 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +0000383 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000384 else
Fredrik Lundhb389df32000-06-29 12:48:37 +0000385 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000387 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600388 err:
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300389 PyMem_Del(state->mark);
390 state->mark = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600391 if (state->buffer.buf)
392 PyBuffer_Release(&state->buffer);
393 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000394}
395
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000396LOCAL(void)
397state_fini(SRE_STATE* state)
398{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600399 if (state->buffer.buf)
400 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000401 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000402 data_stack_dealloc(state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300403 PyMem_Del(state->mark);
404 state->mark = NULL;
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000405}
406
/* character offset of pointer `member` from the start of the string:
   the byte distance from state->beginning divided by state->charsize */
#define STATE_OFFSET(state, member) (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
410
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000411LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300412getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300413 PyObject* string, Py_ssize_t start, Py_ssize_t end)
414{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300415 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300416 if (PyBytes_CheckExact(string) &&
417 start == 0 && end == PyBytes_GET_SIZE(string)) {
418 Py_INCREF(string);
419 return string;
420 }
421 return PyBytes_FromStringAndSize(
422 (const char *)ptr + start, end - start);
423 }
424 else {
425 return PyUnicode_Substring(string, start, end);
426 }
427}
428
429LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000430state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000431{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000432 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000433
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000434 index = (index - 1) * 2;
435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000437 if (empty)
438 /* want empty string */
439 i = j = 0;
440 else {
441 Py_INCREF(Py_None);
442 return Py_None;
443 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000444 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000445 i = STATE_OFFSET(state, state->mark[index]);
446 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000447 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000448
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300449 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000450}
451
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000452static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100453pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000454{
455 switch (status) {
456 case SRE_ERROR_RECURSION_LIMIT:
457 PyErr_SetString(
458 PyExc_RuntimeError,
459 "maximum recursion limit exceeded"
460 );
461 break;
462 case SRE_ERROR_MEMORY:
463 PyErr_NoMemory();
464 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000465 case SRE_ERROR_INTERRUPTED:
466 /* An exception has already been raised, so let it fly */
467 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000468 default:
469 /* other error codes indicate compiler/engine bugs */
470 PyErr_SetString(
471 PyExc_RuntimeError,
472 "internal error in regular expression engine"
473 );
474 }
475}
476
Guido van Rossumb700df92000-03-31 14:59:30 +0000477static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000478pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000479{
Raymond Hettinger027bb632004-05-31 03:09:25 +0000480 if (self->weakreflist != NULL)
481 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000482 Py_XDECREF(self->pattern);
483 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000484 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000485 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +0000486}
487
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300488LOCAL(Py_ssize_t)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300489sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300490{
491 if (state->charsize == 1)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300492 return sre_ucs1_match(state, pattern, match_all);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300493 if (state->charsize == 2)
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300494 return sre_ucs2_match(state, pattern, match_all);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300495 assert(state->charsize == 4);
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300496 return sre_ucs4_match(state, pattern, match_all);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300497}
498
499LOCAL(Py_ssize_t)
500sre_search(SRE_STATE* state, SRE_CODE* pattern)
501{
502 if (state->charsize == 1)
503 return sre_ucs1_search(state, pattern);
504 if (state->charsize == 2)
505 return sre_ucs2_search(state, pattern);
506 assert(state->charsize == 4);
507 return sre_ucs4_search(state, pattern);
508}
509
Larry Hastings16c51912014-01-07 11:53:01 -0800510static PyObject *
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200511fix_string_param(PyObject *string, PyObject *string2, const char *oldname)
512{
513 if (string2 != NULL) {
514 if (string != NULL) {
515 PyErr_Format(PyExc_TypeError,
516 "Argument given by name ('%s') and position (1)",
517 oldname);
518 return NULL;
519 }
520 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
521 "The '%s' keyword parameter name is deprecated. "
522 "Use 'string' instead.", oldname) < 0)
523 return NULL;
524 return string2;
525 }
526 if (string == NULL) {
527 PyErr_SetString(PyExc_TypeError,
528 "Required argument 'string' (pos 1) not found");
529 return NULL;
530 }
531 return string;
532}
Larry Hastings16c51912014-01-07 11:53:01 -0800533
534static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -0800535pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
Larry Hastings16c51912014-01-07 11:53:01 -0800536{
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200537 static char *_keywords[] = {"string", "pos", "endpos", "pattern", NULL};
538 PyObject *string = NULL;
Larry Hastings16c51912014-01-07 11:53:01 -0800539 Py_ssize_t pos = 0;
540 Py_ssize_t endpos = PY_SSIZE_T_MAX;
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200541 PyObject *pattern = NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000542 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100543 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300544 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000545
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200546 if (!PyArg_ParseTupleAndKeywords(args, kwargs,
547 "|Onn$O:match", _keywords,
548 &string, &pos, &endpos, &pattern))
549 return NULL;
550 string = fix_string_param(string, pattern, "pattern");
551 if (!string)
552 return NULL;
553 string = state_init(&state, (PatternObject *)self, string, pos, endpos);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000554 if (!string)
555 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000556
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000557 state.ptr = state.start;
558
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000559 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
560
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300561 status = sre_match(&state, PatternObject_GetCode(self), 0);
Guido van Rossumb700df92000-03-31 14:59:30 +0000562
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000563 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300564 if (PyErr_Occurred()) {
565 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000566 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300567 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000568
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300569 match = pattern_new_match(self, &state, status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000570 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300571 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000572}
573
574static PyObject*
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200575pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
576{
577 SRE_STATE state;
578 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300579 PyObject *match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200580
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200581 PyObject *string = NULL, *string2 = NULL;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200582 Py_ssize_t start = 0;
583 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200584 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
Serhiy Storchakaa537eb42014-03-06 11:36:15 +0200585 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:fullmatch", kwlist,
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200586 &string, &start, &end, &string2))
587 return NULL;
588
589 string = fix_string_param(string, string2, "pattern");
590 if (!string)
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200591 return NULL;
592
593 string = state_init(&state, self, string, start, end);
594 if (!string)
595 return NULL;
596
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200597 state.ptr = state.start;
598
599 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
600
Serhiy Storchaka429b59e2014-05-14 21:48:17 +0300601 status = sre_match(&state, PatternObject_GetCode(self), 1);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200602
603 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300604 if (PyErr_Occurred()) {
605 state_fini(&state);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200606 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300607 }
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200608
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300609 match = pattern_new_match(self, &state, status);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200610 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300611 return match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200612}
613
614static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000615pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000616{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000617 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100618 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300619 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000620
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200621 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000622 Py_ssize_t start = 0;
623 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200624 static char* kwlist[] = { "string", "pos", "endpos", "pattern", NULL };
625 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:search", kwlist,
626 &string, &start, &end, &string2))
627 return NULL;
628
629 string = fix_string_param(string, string2, "pattern");
630 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000631 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000632
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000633 string = state_init(&state, self, string, start, end);
634 if (!string)
635 return NULL;
636
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000637 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
638
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300639 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000640
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000641 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
642
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300643 if (PyErr_Occurred()) {
644 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000645 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300646 }
Thomas Wouters89f507f2006-12-13 04:49:30 +0000647
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300648 match = pattern_new_match(self, &state, status);
649 state_fini(&state);
650 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000651}
652
653static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000654call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000655{
656 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000657 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000658 PyObject* func;
659 PyObject* result;
660
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000661 if (!args)
662 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000663 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000664 if (!name)
665 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000666 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000667 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000668 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000669 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000670 func = PyObject_GetAttrString(mod, function);
671 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000672 if (!func)
673 return NULL;
674 result = PyObject_CallObject(func, args);
675 Py_DECREF(func);
676 Py_DECREF(args);
677 return result;
678}
679
#ifdef USE_BUILTIN_COPY
/*
 * Replace *object with the result of copy.deepcopy(*object, memo).
 *
 * On success the old reference held in *object is released, the new one
 * is stored in its place and 1 is returned.  On failure *object is left
 * untouched and 0 is returned.
 */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* call_args = PyTuple_Pack(2, *object, memo);
    PyObject* duplicate = call("copy", "deepcopy", call_args);

    if (duplicate == NULL)
        return 0;

    Py_DECREF(*object);
    *object = duplicate;
    return 1; /* success */
}
#endif
699
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000700static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +0000701pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +0000702{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000703 SRE_STATE state;
704 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100705 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000706 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000707
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200708 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000709 Py_ssize_t start = 0;
710 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200711 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
712 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:findall", kwlist,
713 &string, &start, &end, &string2))
714 return NULL;
715
716 string = fix_string_param(string, string2, "source");
717 if (!string)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000718 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000719
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000720 string = state_init(&state, self, string, start, end);
721 if (!string)
722 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000723
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000724 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000725 if (!list) {
726 state_fini(&state);
727 return NULL;
728 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000729
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000730 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000731
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000732 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000733
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000734 state_reset(&state);
735
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000736 state.ptr = state.start;
737
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300738 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300739 if (PyErr_Occurred())
740 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000741
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000742 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000743 if (status == 0)
744 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000745 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000746 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000747 }
Tim Peters3d563502006-01-21 02:47:53 +0000748
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000749 /* don't bother to build a match object */
750 switch (self->groups) {
751 case 0:
752 b = STATE_OFFSET(&state, state.start);
753 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300754 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300755 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000756 if (!item)
757 goto error;
758 break;
759 case 1:
760 item = state_getslice(&state, 1, string, 1);
761 if (!item)
762 goto error;
763 break;
764 default:
765 item = PyTuple_New(self->groups);
766 if (!item)
767 goto error;
768 for (i = 0; i < self->groups; i++) {
769 PyObject* o = state_getslice(&state, i+1, string, 1);
770 if (!o) {
771 Py_DECREF(item);
772 goto error;
773 }
774 PyTuple_SET_ITEM(item, i, o);
775 }
776 break;
777 }
778
779 status = PyList_Append(list, item);
780 Py_DECREF(item);
781 if (status < 0)
782 goto error;
783
784 if (state.ptr == state.start)
785 state.start = (void*) ((char*) state.ptr + state.charsize);
786 else
787 state.start = state.ptr;
788
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000789 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000790
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000791 state_fini(&state);
792 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000793
794error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000795 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000796 state_fini(&state);
797 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000798
Guido van Rossumb700df92000-03-31 14:59:30 +0000799}
800
Fredrik Lundh703ce812001-10-24 22:16:30 +0000801static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600802pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
Fredrik Lundh703ce812001-10-24 22:16:30 +0000803{
804 PyObject* scanner;
805 PyObject* search;
806 PyObject* iterator;
807
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600808 scanner = pattern_scanner(pattern, args, kw);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000809 if (!scanner)
810 return NULL;
811
812 search = PyObject_GetAttrString(scanner, "search");
813 Py_DECREF(scanner);
814 if (!search)
815 return NULL;
816
817 iterator = PyCallIter_New(search, Py_None);
818 Py_DECREF(search);
819
820 return iterator;
821}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000822
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000823static PyObject*
824pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
825{
826 SRE_STATE state;
827 PyObject* list;
828 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100829 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000830 Py_ssize_t n;
831 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000832 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000833
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200834 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000835 Py_ssize_t maxsplit = 0;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +0200836 static char* kwlist[] = { "string", "maxsplit", "source", NULL };
837 if (!PyArg_ParseTupleAndKeywords(args, kw, "|On$O:split", kwlist,
838 &string, &maxsplit, &string2))
839 return NULL;
840
841 string = fix_string_param(string, string2, "source");
842 if (!string)
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000843 return NULL;
844
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000845 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000846 if (!string)
847 return NULL;
848
849 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000850 if (!list) {
851 state_fini(&state);
852 return NULL;
853 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000854
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000855 n = 0;
856 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000857
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000858 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000859
860 state_reset(&state);
861
862 state.ptr = state.start;
863
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300864 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300865 if (PyErr_Occurred())
866 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000867
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000868 if (status <= 0) {
869 if (status == 0)
870 break;
871 pattern_error(status);
872 goto error;
873 }
Tim Peters3d563502006-01-21 02:47:53 +0000874
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000875 if (state.start == state.ptr) {
876 if (last == state.end)
877 break;
878 /* skip one character */
879 state.start = (void*) ((char*) state.ptr + state.charsize);
880 continue;
881 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000882
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000883 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300884 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000885 string, STATE_OFFSET(&state, last),
886 STATE_OFFSET(&state, state.start)
887 );
888 if (!item)
889 goto error;
890 status = PyList_Append(list, item);
891 Py_DECREF(item);
892 if (status < 0)
893 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000894
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000895 /* add groups (if any) */
896 for (i = 0; i < self->groups; i++) {
897 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000898 if (!item)
899 goto error;
900 status = PyList_Append(list, item);
901 Py_DECREF(item);
902 if (status < 0)
903 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000904 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000905
906 n = n + 1;
907
908 last = state.start = state.ptr;
909
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000910 }
911
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000912 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300913 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000914 string, STATE_OFFSET(&state, last), state.endpos
915 );
916 if (!item)
917 goto error;
918 status = PyList_Append(list, item);
919 Py_DECREF(item);
920 if (status < 0)
921 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000922
923 state_fini(&state);
924 return list;
925
926error:
927 Py_DECREF(list);
928 state_fini(&state);
929 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000930
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000931}
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000932
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000933static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000934pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000935 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000936{
937 SRE_STATE state;
938 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +0300939 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000940 PyObject* item;
941 PyObject* filter;
942 PyObject* args;
943 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000944 void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100945 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000946 Py_ssize_t n;
947 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300948 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000949 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600950 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000951
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000952 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +0000953 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000954 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +0000955 Py_INCREF(filter);
956 filter_is_callable = 1;
957 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000958 /* if not callable, check if it's a literal string */
959 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600960 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300961 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000963 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300964 if (charsize == 1)
965 literal = memchr(ptr, '\\', n) == NULL;
966 else
967 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000968 } else {
969 PyErr_Clear();
970 literal = 0;
971 }
Benjamin Petersone48944b2012-03-07 14:50:25 -0600972 if (view.buf)
973 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000974 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000975 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000976 Py_INCREF(filter);
977 filter_is_callable = 0;
978 } else {
979 /* not a literal; hand it over to the template compiler */
980 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +0000981 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000982 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000983 );
984 if (!filter)
985 return NULL;
986 filter_is_callable = PyCallable_Check(filter);
987 }
Fredrik Lundhdac58492001-10-21 21:48:30 +0000988 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000989
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000990 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +0000991 if (!string) {
992 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000993 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +0000994 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000995
996 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000997 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +0000998 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000999 state_fini(&state);
1000 return NULL;
1001 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001002
1003 n = i = 0;
1004
1005 while (!count || n < count) {
1006
1007 state_reset(&state);
1008
1009 state.ptr = state.start;
1010
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001011 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001012 if (PyErr_Occurred())
1013 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001014
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001015 if (status <= 0) {
1016 if (status == 0)
1017 break;
1018 pattern_error(status);
1019 goto error;
1020 }
Tim Peters3d563502006-01-21 02:47:53 +00001021
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001022 b = STATE_OFFSET(&state, state.start);
1023 e = STATE_OFFSET(&state, state.ptr);
1024
1025 if (i < b) {
1026 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001027 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001028 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001029 if (!item)
1030 goto error;
1031 status = PyList_Append(list, item);
1032 Py_DECREF(item);
1033 if (status < 0)
1034 goto error;
1035
1036 } else if (i == b && i == e && n > 0)
1037 /* ignore empty match on latest position */
1038 goto next;
1039
1040 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001041 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001042 match = pattern_new_match(self, &state, 1);
1043 if (!match)
1044 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00001045 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001046 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00001047 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001048 goto error;
1049 }
1050 item = PyObject_CallObject(filter, args);
1051 Py_DECREF(args);
1052 Py_DECREF(match);
1053 if (!item)
1054 goto error;
1055 } else {
1056 /* filter is literal string */
1057 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001058 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001059 }
1060
1061 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001062 if (item != Py_None) {
1063 status = PyList_Append(list, item);
1064 Py_DECREF(item);
1065 if (status < 0)
1066 goto error;
1067 }
Tim Peters3d563502006-01-21 02:47:53 +00001068
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001069 i = e;
1070 n = n + 1;
1071
1072next:
1073 /* move on */
1074 if (state.ptr == state.start)
1075 state.start = (void*) ((char*) state.ptr + state.charsize);
1076 else
1077 state.start = state.ptr;
1078
1079 }
1080
1081 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001082 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001083 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001084 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001085 if (!item)
1086 goto error;
1087 status = PyList_Append(list, item);
1088 Py_DECREF(item);
1089 if (status < 0)
1090 goto error;
1091 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001092
1093 state_fini(&state);
1094
Guido van Rossum4e173842001-12-07 04:25:10 +00001095 Py_DECREF(filter);
1096
Fredrik Lundhdac58492001-10-21 21:48:30 +00001097 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001098 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001099 if (!joiner) {
1100 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001101 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001102 }
1103 if (PyList_GET_SIZE(list) == 0) {
1104 Py_DECREF(list);
1105 item = joiner;
1106 }
1107 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001108 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001109 item = _PyBytes_Join(joiner, list);
1110 else
1111 item = PyUnicode_Join(joiner, list);
1112 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001113 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001114 if (!item)
1115 return NULL;
1116 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001117
1118 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001119 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001120
1121 return item;
1122
1123error:
1124 Py_DECREF(list);
1125 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001126 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001127 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001128
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001129}
1130
1131static PyObject*
1132pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
1133{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001134 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001135 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001136 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001137 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001138 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001139 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001140 return NULL;
1141
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001142 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001143}
1144
1145static PyObject*
1146pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
1147{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001148 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001149 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001150 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001151 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001152 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001153 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001154 return NULL;
1155
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001156 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001157}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001158
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001159static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001160pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001161{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001162#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001163 PatternObject* copy;
1164 int offset;
1165
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001166 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1167 if (!copy)
1168 return NULL;
1169
1170 offset = offsetof(PatternObject, groups);
1171
1172 Py_XINCREF(self->groupindex);
1173 Py_XINCREF(self->indexgroup);
1174 Py_XINCREF(self->pattern);
1175
1176 memcpy((char*) copy + offset, (char*) self + offset,
1177 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00001178 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001179
1180 return (PyObject*) copy;
1181#else
1182 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1183 return NULL;
1184#endif
1185}
1186
1187static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001188pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001189{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001190#ifdef USE_BUILTIN_COPY
1191 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00001192
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001193 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001194 if (!copy)
1195 return NULL;
1196
1197 if (!deepcopy(&copy->groupindex, memo) ||
1198 !deepcopy(&copy->indexgroup, memo) ||
1199 !deepcopy(&copy->pattern, memo)) {
1200 Py_DECREF(copy);
1201 return NULL;
1202 }
1203
1204#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001205 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1206 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001207#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001208}
1209
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001210static PyObject *
1211pattern_repr(PatternObject *obj)
1212{
1213 static const struct {
1214 const char *name;
1215 int value;
1216 } flag_names[] = {
1217 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1218 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1219 {"re.LOCALE", SRE_FLAG_LOCALE},
1220 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1221 {"re.DOTALL", SRE_FLAG_DOTALL},
1222 {"re.UNICODE", SRE_FLAG_UNICODE},
1223 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1224 {"re.DEBUG", SRE_FLAG_DEBUG},
1225 {"re.ASCII", SRE_FLAG_ASCII},
1226 };
1227 PyObject *result = NULL;
1228 PyObject *flag_items;
Victor Stinner706768c2014-08-16 01:03:39 +02001229 size_t i;
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001230 int flags = obj->flags;
1231
1232 /* Omit re.UNICODE for valid string patterns. */
1233 if (obj->isbytes == 0 &&
1234 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1235 SRE_FLAG_UNICODE)
1236 flags &= ~SRE_FLAG_UNICODE;
1237
1238 flag_items = PyList_New(0);
1239 if (!flag_items)
1240 return NULL;
1241
1242 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1243 if (flags & flag_names[i].value) {
1244 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1245 if (!item)
1246 goto done;
1247
1248 if (PyList_Append(flag_items, item) < 0) {
1249 Py_DECREF(item);
1250 goto done;
1251 }
1252 Py_DECREF(item);
1253 flags &= ~flag_names[i].value;
1254 }
1255 }
1256 if (flags) {
1257 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1258 if (!item)
1259 goto done;
1260
1261 if (PyList_Append(flag_items, item) < 0) {
1262 Py_DECREF(item);
1263 goto done;
1264 }
1265 Py_DECREF(item);
1266 }
1267
1268 if (PyList_Size(flag_items) > 0) {
1269 PyObject *flags_result;
1270 PyObject *sep = PyUnicode_FromString("|");
1271 if (!sep)
1272 goto done;
1273 flags_result = PyUnicode_Join(sep, flag_items);
1274 Py_DECREF(sep);
1275 if (!flags_result)
1276 goto done;
1277 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1278 obj->pattern, flags_result);
1279 Py_DECREF(flags_result);
1280 }
1281 else {
1282 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1283 }
1284
1285done:
1286 Py_DECREF(flag_items);
1287 return result;
1288}
1289
/* Docstrings for the pattern type: each pattern_<method>_doc is the doc
   entry of the matching row in the pattern_methods table, and
   pattern_doc is used as the tp_doc slot of Pattern_Type. */
Raymond Hettinger94478742004-09-24 04:31:19 +00001290PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001291"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001292 Matches zero or more characters at the beginning of the string");
1293
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001294PyDoc_STRVAR(pattern_fullmatch_doc,
1295"fullmatch(string[, pos[, endpos]]) -> match object or None.\n\
1296 Matches against all of the string");
1297
Raymond Hettinger94478742004-09-24 04:31:19 +00001298PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001299"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001300 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02001301 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001302
1303PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001304"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001305 Split string by the occurrences of pattern.");
1306
1307PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001308"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001309 Return a list of all non-overlapping matches of pattern in string.");
1310
1311PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001312"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001313 Return an iterator over all non-overlapping matches for the \n\
1314 RE pattern in string. For each match, the iterator returns a\n\
1315 match object.");
1316
1317PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001318"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001319 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00001320 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001321
1322PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02001323"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00001324 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
1325 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00001326 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001327
1328PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
1329
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001330static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00001331 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001332 pattern_match_doc},
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02001333 {"fullmatch", (PyCFunction) pattern_fullmatch, METH_VARARGS|METH_KEYWORDS,
1334 pattern_fullmatch_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001335 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001336 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001337 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001338 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00001339 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001340 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001341 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001342 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00001343 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001344 pattern_findall_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001345 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001346 pattern_finditer_doc},
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001347 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001348 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
1349 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001350 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00001351};
1352
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00001353#define PAT_OFF(x) offsetof(PatternObject, x)
1354static PyMemberDef pattern_members[] = {
1355 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
1356 {"flags", T_INT, PAT_OFF(flags), READONLY},
1357 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
1358 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
1359 {NULL} /* Sentinel */
1360};
Guido van Rossumb700df92000-03-31 14:59:30 +00001361
Neal Norwitz57c179c2006-03-22 07:18:02 +00001362static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001363 PyVarObject_HEAD_INIT(NULL, 0)
1364 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00001365 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001366 (destructor)pattern_dealloc, /* tp_dealloc */
1367 0, /* tp_print */
1368 0, /* tp_getattr */
1369 0, /* tp_setattr */
1370 0, /* tp_reserved */
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001371 (reprfunc)pattern_repr, /* tp_repr */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001372 0, /* tp_as_number */
1373 0, /* tp_as_sequence */
1374 0, /* tp_as_mapping */
1375 0, /* tp_hash */
1376 0, /* tp_call */
1377 0, /* tp_str */
1378 0, /* tp_getattro */
1379 0, /* tp_setattro */
1380 0, /* tp_as_buffer */
1381 Py_TPFLAGS_DEFAULT, /* tp_flags */
1382 pattern_doc, /* tp_doc */
1383 0, /* tp_traverse */
1384 0, /* tp_clear */
1385 0, /* tp_richcompare */
1386 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
1387 0, /* tp_iter */
1388 0, /* tp_iternext */
1389 pattern_methods, /* tp_methods */
1390 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00001391};
1392
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001393static int _validate(PatternObject *self); /* Forward */
1394
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001395static PyObject *
1396_compile(PyObject* self_, PyObject* args)
1397{
1398 /* "compile" pattern descriptor to pattern object */
1399
1400 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001401 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001402
1403 PyObject* pattern;
1404 int flags = 0;
1405 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001406 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001407 PyObject* groupindex = NULL;
1408 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001409
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001410 if (!PyArg_ParseTuple(args, "OiO!nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001411 &PyList_Type, &code, &groups,
1412 &groupindex, &indexgroup))
1413 return NULL;
1414
1415 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001416 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001417 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1418 if (!self)
1419 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001420 self->weakreflist = NULL;
1421 self->pattern = NULL;
1422 self->groupindex = NULL;
1423 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001424
1425 self->codesize = n;
1426
1427 for (i = 0; i < n; i++) {
1428 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001429 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001430 self->code[i] = (SRE_CODE) value;
1431 if ((unsigned long) self->code[i] != value) {
1432 PyErr_SetString(PyExc_OverflowError,
1433 "regular expression code size limit exceeded");
1434 break;
1435 }
1436 }
1437
1438 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001439 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001440 return NULL;
1441 }
1442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001444 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 else {
1447 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001448 int charsize;
1449 Py_buffer view;
1450 view.buf = NULL;
1451 if (!getstring(pattern, &p_length, &self->isbytes,
1452 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 Py_DECREF(self);
1454 return NULL;
1455 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001456 if (view.buf)
1457 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001459
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001460 Py_INCREF(pattern);
1461 self->pattern = pattern;
1462
1463 self->flags = flags;
1464
1465 self->groups = groups;
1466
1467 Py_XINCREF(groupindex);
1468 self->groupindex = groupindex;
1469
1470 Py_XINCREF(indexgroup);
1471 self->indexgroup = indexgroup;
1472
1473 self->weakreflist = NULL;
1474
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001475 if (!_validate(self)) {
1476 Py_DECREF(self);
1477 return NULL;
1478 }
1479
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001480 return (PyObject*) self;
1481}
1482
/* -------------------------------------------------------------------- */
/* Code validation */

/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1507
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete,
   parenthesized printf() argument list: VTRACE(("fmt", args)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the *enclosing function*. */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: they read
   from and advance `code`, compare it against `end`, and store into `op`,
   `arg` or `skip` respectively.  Each one FAILs when `code` has already
   reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL unless skip - adj fits in the words that
   remain between the skip word itself and `end`.  The macro argument is
   parenthesized so that an expression argument cannot re-associate with
   the surrounding subtraction and comparison. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (Py_uintptr_t)(end - code))    \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001548
1549static int
1550_validate_charset(SRE_CODE *code, SRE_CODE *end)
1551{
1552 /* Some variables are manipulated by the macros above */
1553 SRE_CODE op;
1554 SRE_CODE arg;
1555 SRE_CODE offset;
1556 int i;
1557
1558 while (code < end) {
1559 GET_OP;
1560 switch (op) {
1561
1562 case SRE_OP_NEGATE:
1563 break;
1564
1565 case SRE_OP_LITERAL:
1566 GET_ARG;
1567 break;
1568
1569 case SRE_OP_RANGE:
1570 GET_ARG;
1571 GET_ARG;
1572 break;
1573
1574 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001575 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001576 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001577 FAIL;
1578 code += offset;
1579 break;
1580
1581 case SRE_OP_BIGCHARSET:
1582 GET_ARG; /* Number of blocks */
1583 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001584 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001585 FAIL;
1586 /* Make sure that each byte points to a valid block */
1587 for (i = 0; i < 256; i++) {
1588 if (((unsigned char *)code)[i] >= arg)
1589 FAIL;
1590 }
1591 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001592 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001593 if (offset > (Py_uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001594 FAIL;
1595 code += offset;
1596 break;
1597
1598 case SRE_OP_CATEGORY:
1599 GET_ARG;
1600 switch (arg) {
1601 case SRE_CATEGORY_DIGIT:
1602 case SRE_CATEGORY_NOT_DIGIT:
1603 case SRE_CATEGORY_SPACE:
1604 case SRE_CATEGORY_NOT_SPACE:
1605 case SRE_CATEGORY_WORD:
1606 case SRE_CATEGORY_NOT_WORD:
1607 case SRE_CATEGORY_LINEBREAK:
1608 case SRE_CATEGORY_NOT_LINEBREAK:
1609 case SRE_CATEGORY_LOC_WORD:
1610 case SRE_CATEGORY_LOC_NOT_WORD:
1611 case SRE_CATEGORY_UNI_DIGIT:
1612 case SRE_CATEGORY_UNI_NOT_DIGIT:
1613 case SRE_CATEGORY_UNI_SPACE:
1614 case SRE_CATEGORY_UNI_NOT_SPACE:
1615 case SRE_CATEGORY_UNI_WORD:
1616 case SRE_CATEGORY_UNI_NOT_WORD:
1617 case SRE_CATEGORY_UNI_LINEBREAK:
1618 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1619 break;
1620 default:
1621 FAIL;
1622 }
1623 break;
1624
1625 default:
1626 FAIL;
1627
1628 }
1629 }
1630
1631 return 1;
1632}
1633
1634static int
1635_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1636{
1637 /* Some variables are manipulated by the macros above */
1638 SRE_CODE op;
1639 SRE_CODE arg;
1640 SRE_CODE skip;
1641
1642 VTRACE(("code=%p, end=%p\n", code, end));
1643
1644 if (code > end)
1645 FAIL;
1646
1647 while (code < end) {
1648 GET_OP;
1649 switch (op) {
1650
1651 case SRE_OP_MARK:
1652 /* We don't check whether marks are properly nested; the
1653 sre_match() code is robust even if they don't, and the worst
1654 you can get is nonsensical match results. */
1655 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001656 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001657 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1658 FAIL;
1659 }
1660 break;
1661
1662 case SRE_OP_LITERAL:
1663 case SRE_OP_NOT_LITERAL:
1664 case SRE_OP_LITERAL_IGNORE:
1665 case SRE_OP_NOT_LITERAL_IGNORE:
1666 GET_ARG;
1667 /* The arg is just a character, nothing to check */
1668 break;
1669
1670 case SRE_OP_SUCCESS:
1671 case SRE_OP_FAILURE:
1672 /* Nothing to check; these normally end the matching process */
1673 break;
1674
1675 case SRE_OP_AT:
1676 GET_ARG;
1677 switch (arg) {
1678 case SRE_AT_BEGINNING:
1679 case SRE_AT_BEGINNING_STRING:
1680 case SRE_AT_BEGINNING_LINE:
1681 case SRE_AT_END:
1682 case SRE_AT_END_LINE:
1683 case SRE_AT_END_STRING:
1684 case SRE_AT_BOUNDARY:
1685 case SRE_AT_NON_BOUNDARY:
1686 case SRE_AT_LOC_BOUNDARY:
1687 case SRE_AT_LOC_NON_BOUNDARY:
1688 case SRE_AT_UNI_BOUNDARY:
1689 case SRE_AT_UNI_NON_BOUNDARY:
1690 break;
1691 default:
1692 FAIL;
1693 }
1694 break;
1695
1696 case SRE_OP_ANY:
1697 case SRE_OP_ANY_ALL:
1698 /* These have no operands */
1699 break;
1700
1701 case SRE_OP_IN:
1702 case SRE_OP_IN_IGNORE:
1703 GET_SKIP;
1704 /* Stop 1 before the end; we check the FAILURE below */
1705 if (!_validate_charset(code, code+skip-2))
1706 FAIL;
1707 if (code[skip-2] != SRE_OP_FAILURE)
1708 FAIL;
1709 code += skip-1;
1710 break;
1711
1712 case SRE_OP_INFO:
1713 {
1714 /* A minimal info field is
1715 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1716 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1717 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001718 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001719 SRE_CODE *newcode;
1720 GET_SKIP;
1721 newcode = code+skip-1;
1722 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001723 GET_ARG;
1724 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001725 /* Check that only valid flags are present */
1726 if ((flags & ~(SRE_INFO_PREFIX |
1727 SRE_INFO_LITERAL |
1728 SRE_INFO_CHARSET)) != 0)
1729 FAIL;
1730 /* PREFIX and CHARSET are mutually exclusive */
1731 if ((flags & SRE_INFO_PREFIX) &&
1732 (flags & SRE_INFO_CHARSET))
1733 FAIL;
1734 /* LITERAL implies PREFIX */
1735 if ((flags & SRE_INFO_LITERAL) &&
1736 !(flags & SRE_INFO_PREFIX))
1737 FAIL;
1738 /* Validate the prefix */
1739 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001740 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001741 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001742 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001743 /* Here comes the prefix string */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001744 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001745 FAIL;
1746 code += prefix_len;
1747 /* And here comes the overlap table */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001748 if (prefix_len > (Py_uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001749 FAIL;
1750 /* Each overlap value should be < prefix_len */
1751 for (i = 0; i < prefix_len; i++) {
1752 if (code[i] >= prefix_len)
1753 FAIL;
1754 }
1755 code += prefix_len;
1756 }
1757 /* Validate the charset */
1758 if (flags & SRE_INFO_CHARSET) {
1759 if (!_validate_charset(code, newcode-1))
1760 FAIL;
1761 if (newcode[-1] != SRE_OP_FAILURE)
1762 FAIL;
1763 code = newcode;
1764 }
1765 else if (code != newcode) {
1766 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1767 FAIL;
1768 }
1769 }
1770 break;
1771
1772 case SRE_OP_BRANCH:
1773 {
1774 SRE_CODE *target = NULL;
1775 for (;;) {
1776 GET_SKIP;
1777 if (skip == 0)
1778 break;
1779 /* Stop 2 before the end; we check the JUMP below */
1780 if (!_validate_inner(code, code+skip-3, groups))
1781 FAIL;
1782 code += skip-3;
1783 /* Check that it ends with a JUMP, and that each JUMP
1784 has the same target */
1785 GET_OP;
1786 if (op != SRE_OP_JUMP)
1787 FAIL;
1788 GET_SKIP;
1789 if (target == NULL)
1790 target = code+skip-1;
1791 else if (code+skip-1 != target)
1792 FAIL;
1793 }
1794 }
1795 break;
1796
1797 case SRE_OP_REPEAT_ONE:
1798 case SRE_OP_MIN_REPEAT_ONE:
1799 {
1800 SRE_CODE min, max;
1801 GET_SKIP;
1802 GET_ARG; min = arg;
1803 GET_ARG; max = arg;
1804 if (min > max)
1805 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001806 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001807 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001808 if (!_validate_inner(code, code+skip-4, groups))
1809 FAIL;
1810 code += skip-4;
1811 GET_OP;
1812 if (op != SRE_OP_SUCCESS)
1813 FAIL;
1814 }
1815 break;
1816
1817 case SRE_OP_REPEAT:
1818 {
1819 SRE_CODE min, max;
1820 GET_SKIP;
1821 GET_ARG; min = arg;
1822 GET_ARG; max = arg;
1823 if (min > max)
1824 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001825 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001826 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001827 if (!_validate_inner(code, code+skip-3, groups))
1828 FAIL;
1829 code += skip-3;
1830 GET_OP;
1831 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1832 FAIL;
1833 }
1834 break;
1835
1836 case SRE_OP_GROUPREF:
1837 case SRE_OP_GROUPREF_IGNORE:
1838 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001839 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001840 FAIL;
1841 break;
1842
1843 case SRE_OP_GROUPREF_EXISTS:
1844 /* The regex syntax for this is: '(?(group)then|else)', where
1845 'group' is either an integer group number or a group name,
1846 'then' and 'else' are sub-regexes, and 'else' is optional. */
1847 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001848 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001849 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001850 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001851 code--; /* The skip is relative to the first arg! */
1852 /* There are two possibilities here: if there is both a 'then'
1853 part and an 'else' part, the generated code looks like:
1854
1855 GROUPREF_EXISTS
1856 <group>
1857 <skipyes>
1858 ...then part...
1859 JUMP
1860 <skipno>
1861 (<skipyes> jumps here)
1862 ...else part...
1863 (<skipno> jumps here)
1864
1865 If there is only a 'then' part, it looks like:
1866
1867 GROUPREF_EXISTS
1868 <group>
1869 <skip>
1870 ...then part...
1871 (<skip> jumps here)
1872
1873 There is no direct way to decide which it is, and we don't want
1874 to allow arbitrary jumps anywhere in the code; so we just look
1875 for a JUMP opcode preceding our skip target.
1876 */
Victor Stinner1fa174a2013-08-28 02:06:21 +02001877 if (skip >= 3 && skip-3 < (Py_uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001878 code[skip-3] == SRE_OP_JUMP)
1879 {
1880 VTRACE(("both then and else parts present\n"));
1881 if (!_validate_inner(code+1, code+skip-3, groups))
1882 FAIL;
1883 code += skip-2; /* Position after JUMP, at <skipno> */
1884 GET_SKIP;
1885 if (!_validate_inner(code, code+skip-1, groups))
1886 FAIL;
1887 code += skip-1;
1888 }
1889 else {
1890 VTRACE(("only a then part present\n"));
1891 if (!_validate_inner(code+1, code+skip-1, groups))
1892 FAIL;
1893 code += skip-1;
1894 }
1895 break;
1896
1897 case SRE_OP_ASSERT:
1898 case SRE_OP_ASSERT_NOT:
1899 GET_SKIP;
1900 GET_ARG; /* 0 for lookahead, width for lookbehind */
1901 code--; /* Back up over arg to simplify math below */
1902 if (arg & 0x80000000)
1903 FAIL; /* Width too large */
1904 /* Stop 1 before the end; we check the SUCCESS below */
1905 if (!_validate_inner(code+1, code+skip-2, groups))
1906 FAIL;
1907 code += skip-2;
1908 GET_OP;
1909 if (op != SRE_OP_SUCCESS)
1910 FAIL;
1911 break;
1912
1913 default:
1914 FAIL;
1915
1916 }
1917 }
1918
1919 VTRACE(("okay\n"));
1920 return 1;
1921}
1922
1923static int
1924_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1925{
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001926 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1927 code >= end || end[-1] != SRE_OP_SUCCESS)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001928 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001929 return _validate_inner(code, end-1, groups);
1930}
1931
1932static int
1933_validate(PatternObject *self)
1934{
1935 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1936 {
1937 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1938 return 0;
1939 }
1940 else
1941 VTRACE(("Success!\n"));
1942 return 1;
1943}
1944
1945/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001946/* match methods */
1947
1948static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001949match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001950{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001951 Py_XDECREF(self->regs);
1952 Py_XDECREF(self->string);
1953 Py_DECREF(self->pattern);
1954 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001955}
1956
1957static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001958match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001959{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001960 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001961 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001962 Py_buffer view;
1963 PyObject *result;
1964 void* ptr;
1965
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001966 if (index < 0 || index >= self->groups) {
1967 /* raise IndexError if we were given a bad group number */
1968 PyErr_SetString(
1969 PyExc_IndexError,
1970 "no such group"
1971 );
1972 return NULL;
1973 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001974
Fredrik Lundh6f013982000-07-03 18:44:21 +00001975 index *= 2;
1976
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001977 if (self->string == Py_None || self->mark[index] < 0) {
1978 /* return default value if the string or group is undefined */
1979 Py_INCREF(def);
1980 return def;
1981 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001982
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001983 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001984 if (ptr == NULL)
1985 return NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001986 result = getslice(isbytes, ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001987 self->string, self->mark[index], self->mark[index+1]);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001988 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001989 PyBuffer_Release(&view);
1990 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001991}
1992
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001993static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001994match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001995{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001996 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001997
Guido van Rossumddefaf32007-01-14 03:31:43 +00001998 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001999 /* Default value */
2000 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00002001
Christian Heimes217cfd12007-12-02 14:31:20 +00002002 if (PyLong_Check(index))
2003 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00002004
Fredrik Lundh6f013982000-07-03 18:44:21 +00002005 i = -1;
2006
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002007 if (self->pattern->groupindex) {
2008 index = PyObject_GetItem(self->pattern->groupindex, index);
2009 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00002010 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00002011 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002012 Py_DECREF(index);
2013 } else
2014 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002015 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002016
2017 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002018}
2019
2020static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002021match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002022{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002023 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002024}
2025
2026static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002027match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002028{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002029 /* delegate to Python code */
2030 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002031 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002032 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002033 );
2034}
2035
2036static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002037match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002038{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002039 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002040 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002041
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002042 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002043
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 switch (size) {
2045 case 0:
2046 result = match_getslice(self, Py_False, Py_None);
2047 break;
2048 case 1:
2049 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2050 break;
2051 default:
2052 /* fetch multiple items */
2053 result = PyTuple_New(size);
2054 if (!result)
2055 return NULL;
2056 for (i = 0; i < size; i++) {
2057 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002058 self, PyTuple_GET_ITEM(args, i), Py_None
2059 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 if (!item) {
2061 Py_DECREF(result);
2062 return NULL;
2063 }
2064 PyTuple_SET_ITEM(result, i, item);
2065 }
2066 break;
2067 }
2068 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002069}
2070
2071static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002072match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002073{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002074 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002075 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002076
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002077 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002078 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00002079 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002080 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002081
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002082 result = PyTuple_New(self->groups-1);
2083 if (!result)
2084 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002085
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 for (index = 1; index < self->groups; index++) {
2087 PyObject* item;
2088 item = match_getslice_by_index(self, index, def);
2089 if (!item) {
2090 Py_DECREF(result);
2091 return NULL;
2092 }
2093 PyTuple_SET_ITEM(result, index-1, item);
2094 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002095
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002096 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002097}
2098
2099static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002100match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002101{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002102 PyObject* result;
2103 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002104 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002105
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002106 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002107 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00002108 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002109 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002110
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002111 result = PyDict_New();
2112 if (!result || !self->pattern->groupindex)
2113 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002114
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002115 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002116 if (!keys)
2117 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00002118
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002119 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002120 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002121 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002122 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002123 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002124 if (!key)
2125 goto failed;
2126 value = match_getslice(self, key, def);
2127 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002129 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002130 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00002131 status = PyDict_SetItem(result, key, value);
2132 Py_DECREF(value);
2133 if (status < 0)
2134 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002135 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002136
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002137 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00002138
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002139 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002140
2141failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00002142 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002143 Py_DECREF(result);
2144 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002145}
2146
2147static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002148match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002149{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002150 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002151
Guido van Rossumddefaf32007-01-14 03:31:43 +00002152 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002153 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002154 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002155
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002156 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002158 if (index < 0 || index >= self->groups) {
2159 PyErr_SetString(
2160 PyExc_IndexError,
2161 "no such group"
2162 );
2163 return NULL;
2164 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002165
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002166 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002167 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00002168}
2169
2170static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002171match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002172{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002173 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002174
Guido van Rossumddefaf32007-01-14 03:31:43 +00002175 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002176 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002177 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002178
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002179 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002180
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002181 if (index < 0 || index >= self->groups) {
2182 PyErr_SetString(
2183 PyExc_IndexError,
2184 "no such group"
2185 );
2186 return NULL;
2187 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002188
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002189 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002190 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002191}
2192
2193LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002194_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002195{
2196 PyObject* pair;
2197 PyObject* item;
2198
2199 pair = PyTuple_New(2);
2200 if (!pair)
2201 return NULL;
2202
Christian Heimes217cfd12007-12-02 14:31:20 +00002203 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002204 if (!item)
2205 goto error;
2206 PyTuple_SET_ITEM(pair, 0, item);
2207
Christian Heimes217cfd12007-12-02 14:31:20 +00002208 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002209 if (!item)
2210 goto error;
2211 PyTuple_SET_ITEM(pair, 1, item);
2212
2213 return pair;
2214
2215 error:
2216 Py_DECREF(pair);
2217 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002218}
2219
2220static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002221match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002222{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002223 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002224
Guido van Rossumddefaf32007-01-14 03:31:43 +00002225 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002226 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002227 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002228
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002229 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002230
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002231 if (index < 0 || index >= self->groups) {
2232 PyErr_SetString(
2233 PyExc_IndexError,
2234 "no such group"
2235 );
2236 return NULL;
2237 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002238
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002239 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002240 return _pair(self->mark[index*2], self->mark[index*2+1]);
2241}
2242
2243static PyObject*
2244match_regs(MatchObject* self)
2245{
2246 PyObject* regs;
2247 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002248 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002249
2250 regs = PyTuple_New(self->groups);
2251 if (!regs)
2252 return NULL;
2253
2254 for (index = 0; index < self->groups; index++) {
2255 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2256 if (!item) {
2257 Py_DECREF(regs);
2258 return NULL;
2259 }
2260 PyTuple_SET_ITEM(regs, index, item);
2261 }
2262
2263 Py_INCREF(regs);
2264 self->regs = regs;
2265
2266 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002267}
2268
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002269static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002270match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002271{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002272#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002273 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002274 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00002275
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002276 slots = 2 * (self->pattern->groups+1);
2277
2278 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2279 if (!copy)
2280 return NULL;
2281
2282 /* this value a constant, but any compiler should be able to
2283 figure that out all by itself */
2284 offset = offsetof(MatchObject, string);
2285
2286 Py_XINCREF(self->pattern);
2287 Py_XINCREF(self->string);
2288 Py_XINCREF(self->regs);
2289
2290 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002291 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002292
2293 return (PyObject*) copy;
2294#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002295 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002296 return NULL;
2297#endif
2298}
2299
2300static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002301match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002302{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002303#ifdef USE_BUILTIN_COPY
2304 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002305
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002306 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002307 if (!copy)
2308 return NULL;
2309
2310 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2311 !deepcopy(&copy->string, memo) ||
2312 !deepcopy(&copy->regs, memo)) {
2313 Py_DECREF(copy);
2314 return NULL;
2315 }
2316
2317#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002318 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2319 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002320#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002321}
2322
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002323PyDoc_STRVAR(match_doc,
2324"The result of re.match() and re.search().\n\
2325Match objects always have a boolean value of True.");
2326
2327PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002328"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002329 Return subgroup(s) of the match by indices or names.\n\
2330 For 0 returns the entire match.");
2331
2332PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002333"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002334 Return index of the start of the substring matched by group.");
2335
2336PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002337"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002338 Return index of the end of the substring matched by group.");
2339
2340PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002341"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002342 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
2343
2344PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002345"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002346 Return a tuple containing all the subgroups of the match, from 1.\n\
2347 The default argument is used for groups\n\
2348 that did not participate in the match");
2349
2350PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002351"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002352 Return a dictionary containing all the named subgroups of the match,\n\
2353 keyed by the subgroup name. The default argument is used for groups\n\
2354 that did not participate in the match");
2355
2356PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002357"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002358 Return the string obtained by doing backslash substitution\n\
2359 on the string template, as done by the sub() method.");
2360
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002361static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002362 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2363 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
2364 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
2365 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
2366 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
2367 match_groups_doc},
2368 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
2369 match_groupdict_doc},
2370 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002371 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
2372 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002373 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002374};
2375
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002376static PyObject *
2377match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00002378{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002379 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002380 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002381 Py_INCREF(Py_None);
2382 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00002383}
2384
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002385static PyObject *
2386match_lastgroup_get(MatchObject *self)
2387{
2388 if (self->pattern->indexgroup && self->lastindex >= 0) {
2389 PyObject* result = PySequence_GetItem(
2390 self->pattern->indexgroup, self->lastindex
2391 );
2392 if (result)
2393 return result;
2394 PyErr_Clear();
2395 }
2396 Py_INCREF(Py_None);
2397 return Py_None;
2398}
2399
2400static PyObject *
2401match_regs_get(MatchObject *self)
2402{
2403 if (self->regs) {
2404 Py_INCREF(self->regs);
2405 return self->regs;
2406 } else
2407 return match_regs(self);
2408}
2409
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002410static PyObject *
2411match_repr(MatchObject *self)
2412{
2413 PyObject *result;
2414 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2415 if (group0 == NULL)
2416 return NULL;
2417 result = PyUnicode_FromFormat(
2418 "<%s object; span=(%d, %d), match=%.50R>",
2419 Py_TYPE(self)->tp_name,
2420 self->mark[0], self->mark[1], group0);
2421 Py_DECREF(group0);
2422 return result;
2423}
2424
2425
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002426static PyGetSetDef match_getset[] = {
2427 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
2428 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
2429 {"regs", (getter)match_regs_get, (setter)NULL},
2430 {NULL}
2431};
2432
2433#define MATCH_OFF(x) offsetof(MatchObject, x)
2434static PyMemberDef match_members[] = {
2435 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
2436 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
2437 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
2438 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
2439 {NULL}
2440};
2441
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2444
Neal Norwitz57c179c2006-03-22 07:18:02 +00002445static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002446 PyVarObject_HEAD_INIT(NULL,0)
2447 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002448 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002449 (destructor)match_dealloc, /* tp_dealloc */
2450 0, /* tp_print */
2451 0, /* tp_getattr */
2452 0, /* tp_setattr */
2453 0, /* tp_reserved */
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002454 (reprfunc)match_repr, /* tp_repr */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002455 0, /* tp_as_number */
2456 0, /* tp_as_sequence */
2457 0, /* tp_as_mapping */
2458 0, /* tp_hash */
2459 0, /* tp_call */
2460 0, /* tp_str */
2461 0, /* tp_getattro */
2462 0, /* tp_setattro */
2463 0, /* tp_as_buffer */
2464 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002465 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002466 0, /* tp_traverse */
2467 0, /* tp_clear */
2468 0, /* tp_richcompare */
2469 0, /* tp_weaklistoffset */
2470 0, /* tp_iter */
2471 0, /* tp_iternext */
2472 match_methods, /* tp_methods */
2473 match_members, /* tp_members */
2474 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00002475};
2476
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002477static PyObject*
Victor Stinnerf5587782013-11-15 23:21:11 +01002478pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002479{
2480 /* create match object (from state object) */
2481
2482 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002483 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002484 char* base;
2485 int n;
2486
2487 if (status > 0) {
2488
2489 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002490 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002491 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
2492 2*(pattern->groups+1));
2493 if (!match)
2494 return NULL;
2495
2496 Py_INCREF(pattern);
2497 match->pattern = pattern;
2498
2499 Py_INCREF(state->string);
2500 match->string = state->string;
2501
2502 match->regs = NULL;
2503 match->groups = pattern->groups+1;
2504
2505 /* fill in group slices */
2506
2507 base = (char*) state->beginning;
2508 n = state->charsize;
2509
2510 match->mark[0] = ((char*) state->start - base) / n;
2511 match->mark[1] = ((char*) state->ptr - base) / n;
2512
2513 for (i = j = 0; i < pattern->groups; i++, j+=2)
2514 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2515 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2516 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2517 } else
2518 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2519
2520 match->pos = state->pos;
2521 match->endpos = state->endpos;
2522
2523 match->lastindex = state->lastindex;
2524
2525 return (PyObject*) match;
2526
2527 } else if (status == 0) {
2528
2529 /* no match */
2530 Py_INCREF(Py_None);
2531 return Py_None;
2532
2533 }
2534
2535 /* internal error */
2536 pattern_error(status);
2537 return NULL;
2538}
2539
2540
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002541/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002542/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002543
2544static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002545scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002546{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002547 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002548 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002549 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002550}
2551
2552static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002553scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002554{
2555 SRE_STATE* state = &self->state;
2556 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002557 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002558
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002559 state_reset(state);
2560
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002561 state->ptr = state->start;
2562
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03002563 status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
Thomas Wouters89f507f2006-12-13 04:49:30 +00002564 if (PyErr_Occurred())
2565 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002566
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002567 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002568 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002569
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00002570 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002571 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002572 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002573 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002574
2575 return match;
2576}
2577
2578
2579static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002580scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002581{
2582 SRE_STATE* state = &self->state;
2583 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002584 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002585
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002586 state_reset(state);
2587
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002588 state->ptr = state->start;
2589
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002590 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002591 if (PyErr_Occurred())
2592 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002593
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002594 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002595 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002596
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00002597 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002598 state->start = (void*) ((char*) state->ptr + state->charsize);
2599 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002600 state->start = state->ptr;
2601
2602 return match;
2603}
2604
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002605static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002606 {"match", (PyCFunction) scanner_match, METH_NOARGS},
2607 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002608 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002609};
2610
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002611#define SCAN_OFF(x) offsetof(ScannerObject, x)
2612static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03002613 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002614 {NULL} /* Sentinel */
2615};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002616
Neal Norwitz57c179c2006-03-22 07:18:02 +00002617static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002618 PyVarObject_HEAD_INIT(NULL, 0)
2619 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002620 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002621 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002622 0, /* tp_print */
2623 0, /* tp_getattr */
2624 0, /* tp_setattr */
2625 0, /* tp_reserved */
2626 0, /* tp_repr */
2627 0, /* tp_as_number */
2628 0, /* tp_as_sequence */
2629 0, /* tp_as_mapping */
2630 0, /* tp_hash */
2631 0, /* tp_call */
2632 0, /* tp_str */
2633 0, /* tp_getattro */
2634 0, /* tp_setattro */
2635 0, /* tp_as_buffer */
2636 Py_TPFLAGS_DEFAULT, /* tp_flags */
2637 0, /* tp_doc */
2638 0, /* tp_traverse */
2639 0, /* tp_clear */
2640 0, /* tp_richcompare */
2641 0, /* tp_weaklistoffset */
2642 0, /* tp_iter */
2643 0, /* tp_iternext */
2644 scanner_methods, /* tp_methods */
2645 scanner_members, /* tp_members */
2646 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002647};
2648
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002649static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002650pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002651{
2652 /* create search state object */
2653
2654 ScannerObject* self;
2655
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02002656 PyObject *string = NULL, *string2 = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002657 Py_ssize_t start = 0;
2658 Py_ssize_t end = PY_SSIZE_T_MAX;
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02002659 static char* kwlist[] = { "string", "pos", "endpos", "source", NULL };
2660 if (!PyArg_ParseTupleAndKeywords(args, kw, "|Onn$O:scanner", kwlist,
2661 &string, &start, &end, &string2))
2662 return NULL;
2663
2664 string = fix_string_param(string, string2, "source");
2665 if (!string)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002666 return NULL;
2667
2668 /* create scanner object */
2669 self = PyObject_NEW(ScannerObject, &Scanner_Type);
2670 if (!self)
2671 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002672 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002673
2674 string = state_init(&self->state, pattern, string, start, end);
2675 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002676 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002677 return NULL;
2678 }
2679
2680 Py_INCREF(pattern);
2681 self->pattern = (PyObject*) pattern;
2682
2683 return (PyObject*) self;
2684}
2685
Guido van Rossumb700df92000-03-31 14:59:30 +00002686static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00002687 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002688 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00002689 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002690 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002691};
2692
Martin v. Löwis1a214512008-06-11 05:26:20 +00002693static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002694 PyModuleDef_HEAD_INIT,
2695 "_" SRE_MODULE,
2696 NULL,
2697 -1,
2698 _functions,
2699 NULL,
2700 NULL,
2701 NULL,
2702 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00002703};
2704
2705PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002706{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002707 PyObject* m;
2708 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002709 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002710
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00002711 /* Patch object types */
2712 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
2713 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00002714 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002715
Martin v. Löwis1a214512008-06-11 05:26:20 +00002716 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00002717 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002718 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002719 d = PyModule_GetDict(m);
2720
Christian Heimes217cfd12007-12-02 14:31:20 +00002721 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002722 if (x) {
2723 PyDict_SetItemString(d, "MAGIC", x);
2724 Py_DECREF(x);
2725 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002726
Christian Heimes217cfd12007-12-02 14:31:20 +00002727 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00002728 if (x) {
2729 PyDict_SetItemString(d, "CODESIZE", x);
2730 Py_DECREF(x);
2731 }
2732
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02002733 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
2734 if (x) {
2735 PyDict_SetItemString(d, "MAXREPEAT", x);
2736 Py_DECREF(x);
2737 }
2738
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03002739 x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
2740 if (x) {
2741 PyDict_SetItemString(d, "MAXGROUPS", x);
2742 Py_DECREF(x);
2743 }
2744
Neal Norwitzfe537132007-08-26 03:55:15 +00002745 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002746 if (x) {
2747 PyDict_SetItemString(d, "copyright", x);
2748 Py_DECREF(x);
2749 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00002750 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00002751}
2752
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002753/* vim:ts=4:sw=4:et
2754*/