blob: fbabeb7c9f3054172321cbbb7349956f29f67ae2 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02007 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 * 2013-02-04 mrab added fullmatch primitive
Guido van Rossumb700df92000-03-31 14:59:30 +000026 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000027 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000028 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000029 * This version of the SRE library can be redistributed under CNRI's
30 * Python 1.6 license. For any other use, please contact Secret Labs
31 * AB (info@pythonware.com).
32 *
Guido van Rossumb700df92000-03-31 14:59:30 +000033 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000034 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000035 * other compatibility work.
36 */
37
Serhiy Storchaka2d06e842015-12-25 19:53:18 +020038static const char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000039 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Victor Stinner37834132020-10-27 17:12:53 +010044#include "pycore_long.h" // _PyLong_GetZero()
Victor Stinner4a21e572020-04-15 02:35:41 +020045#include "structmember.h" // PyMemberDef
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030049#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
50
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000051#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000052
Fredrik Lundh436c3d582000-06-29 08:58:44 +000053/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000054#if !defined(SRE_MODULE)
55#define SRE_MODULE "sre"
56#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000057
Thomas Wouters9ada3d62006-04-21 09:47:09 +000058#define SRE_PY_MODULE "re"
59
Guido van Rossumb700df92000-03-31 14:59:30 +000060/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000061#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000063/* -------------------------------------------------------------------- */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064
Fredrik Lundh80946112000-06-29 18:03:25 +000065#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000066#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000067#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000068/* fastest possible local call under MSVC */
69#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000070#else
Benjamin Peterson791dc832017-04-20 23:52:19 -070071#define LOCAL(type) static inline type
Guido van Rossumb700df92000-03-31 14:59:30 +000072#endif
73
74/* error codes */
75#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000076#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000077#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000078#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000079#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000080
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000081#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +000082#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000083#else
84#define TRACE(v)
85#endif
86
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000087/* -------------------------------------------------------------------- */
88/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000089
/* ASCII-only character predicates.  Each range-limited predicate compares
 * ch against an upper bound first, so the Py_IS* classifier is evaluated
 * only when that comparison succeeds. */
#define SRE_IS_DIGIT(ch) ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch) ((ch) <= ' ' && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_IS_WORD(ch) ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
Guido van Rossumb700df92000-03-31 14:59:30 +000098
/* Lowercase an ASCII code point via Py_TOLOWER.  Values of 128 and above
   are returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
103
/* locale-specific character predicates */
/* The range test is spelled with a mask ((ch) & ~255) rather than a
 * comparison against 256: both agree for any unsigned ch, and the mask
 * form avoids compiler warnings when ch's type cannot hold values above
 * 255. */
#define SRE_LOC_IS_ALNUM(ch) (((ch) & ~255) ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
109
/* Lowercase ch with the C library's tolower() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)tolower(ch);
    }
    return ch;
}
114
/* Uppercase ch with the C library's toupper() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)toupper(ch);
    }
    return ch;
}
119
/* unicode-specific character predicates */

/* Each predicate defers to the correspondingly named Py_UNICODE_*
 * classifier; the word predicate additionally accepts the underscore. */
#define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch)      (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000127
/* Lowercase a code point using Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
132
/* Uppercase a code point using Py_UNICODE_TOUPPER. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
    return raised;
}
137
Guido van Rossumb700df92000-03-31 14:59:30 +0000138LOCAL(int)
139sre_category(SRE_CODE category, unsigned int ch)
140{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000141 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000142
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000143 case SRE_CATEGORY_DIGIT:
144 return SRE_IS_DIGIT(ch);
145 case SRE_CATEGORY_NOT_DIGIT:
146 return !SRE_IS_DIGIT(ch);
147 case SRE_CATEGORY_SPACE:
148 return SRE_IS_SPACE(ch);
149 case SRE_CATEGORY_NOT_SPACE:
150 return !SRE_IS_SPACE(ch);
151 case SRE_CATEGORY_WORD:
152 return SRE_IS_WORD(ch);
153 case SRE_CATEGORY_NOT_WORD:
154 return !SRE_IS_WORD(ch);
155 case SRE_CATEGORY_LINEBREAK:
156 return SRE_IS_LINEBREAK(ch);
157 case SRE_CATEGORY_NOT_LINEBREAK:
158 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000159
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000160 case SRE_CATEGORY_LOC_WORD:
161 return SRE_LOC_IS_WORD(ch);
162 case SRE_CATEGORY_LOC_NOT_WORD:
163 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000164
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000165 case SRE_CATEGORY_UNI_DIGIT:
166 return SRE_UNI_IS_DIGIT(ch);
167 case SRE_CATEGORY_UNI_NOT_DIGIT:
168 return !SRE_UNI_IS_DIGIT(ch);
169 case SRE_CATEGORY_UNI_SPACE:
170 return SRE_UNI_IS_SPACE(ch);
171 case SRE_CATEGORY_UNI_NOT_SPACE:
172 return !SRE_UNI_IS_SPACE(ch);
173 case SRE_CATEGORY_UNI_WORD:
174 return SRE_UNI_IS_WORD(ch);
175 case SRE_CATEGORY_UNI_NOT_WORD:
176 return !SRE_UNI_IS_WORD(ch);
177 case SRE_CATEGORY_UNI_LINEBREAK:
178 return SRE_UNI_IS_LINEBREAK(ch);
179 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
180 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000181 }
182 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000183}
184
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300185LOCAL(int)
186char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
187{
188 return ch == pattern
189 || (SRE_CODE) sre_lower_locale(ch) == pattern
190 || (SRE_CODE) sre_upper_locale(ch) == pattern;
191}
192
193
Guido van Rossumb700df92000-03-31 14:59:30 +0000194/* helpers */
195
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000196static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000197data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000198{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000199 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000201 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000202 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000203 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000204}
205
206static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000207data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000208{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000209 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000210 minsize = state->data_stack_base+size;
211 cursize = state->data_stack_size;
212 if (cursize < minsize) {
213 void* stack;
214 cursize = minsize+minsize/4+1024;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200215 TRACE(("allocate/grow stack %zd\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000216 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000217 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000218 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000219 return SRE_ERROR_MEMORY;
220 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000221 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000222 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000223 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000224 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000225}
226
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000227/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000228
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300229#define SRE_CHAR Py_UCS1
230#define SIZEOF_SRE_CHAR 1
231#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300232#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000233
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300234/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000235
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300236#define SRE_CHAR Py_UCS2
237#define SIZEOF_SRE_CHAR 2
238#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300239#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000240
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300241/* generate 32-bit unicode version */
242
243#define SRE_CHAR Py_UCS4
244#define SIZEOF_SRE_CHAR 4
245#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300246#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000247
248/* -------------------------------------------------------------------- */
249/* factories and destructors */
250
251/* see sre.h for object declarations */
Victor Stinnerf5587782013-11-15 23:21:11 +0100252static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, Py_ssize_t);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300253static PyObject *pattern_scanner(PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
Guido van Rossumb700df92000-03-31 14:59:30 +0000254
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300255
256/*[clinic input]
257module _sre
258class _sre.SRE_Pattern "PatternObject *" "&Pattern_Type"
259class _sre.SRE_Match "MatchObject *" "&Match_Type"
260class _sre.SRE_Scanner "ScannerObject *" "&Scanner_Type"
261[clinic start generated code]*/
262/*[clinic end generated code: output=da39a3ee5e6b4b0d input=b0230ec19a0deac8]*/
263
Larry Hastings2d0a69a2015-05-03 14:49:19 -0700264static PyTypeObject Pattern_Type;
265static PyTypeObject Match_Type;
266static PyTypeObject Scanner_Type;
267
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300268/*[clinic input]
269_sre.getcodesize -> int
270[clinic start generated code]*/
271
272static int
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +0300273_sre_getcodesize_impl(PyObject *module)
274/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000275{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300276 return sizeof(SRE_CODE);
Guido van Rossumb700df92000-03-31 14:59:30 +0000277}
278
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300279/*[clinic input]
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300280_sre.ascii_iscased -> bool
281
282 character: int
283 /
284
285[clinic start generated code]*/
286
287static int
288_sre_ascii_iscased_impl(PyObject *module, int character)
289/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
290{
291 unsigned int ch = (unsigned int)character;
Sergey Fedoseev7f0d59f2018-09-12 17:49:09 +0500292 return ch < 128 && Py_ISALPHA(ch);
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300293}
294
295/*[clinic input]
296_sre.unicode_iscased -> bool
297
298 character: int
299 /
300
301[clinic start generated code]*/
302
303static int
304_sre_unicode_iscased_impl(PyObject *module, int character)
305/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
306{
307 unsigned int ch = (unsigned int)character;
308 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
309}
310
311/*[clinic input]
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300312_sre.ascii_tolower -> int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300313
314 character: int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300315 /
316
317[clinic start generated code]*/
318
319static int
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300320_sre_ascii_tolower_impl(PyObject *module, int character)
321/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000322{
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300323 return sre_lower_ascii(character);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000324}
325
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300326/*[clinic input]
327_sre.unicode_tolower -> int
328
329 character: int
330 /
331
332[clinic start generated code]*/
333
334static int
335_sre_unicode_tolower_impl(PyObject *module, int character)
336/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
337{
338 return sre_lower_unicode(character);
339}
340
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000341LOCAL(void)
342state_reset(SRE_STATE* state)
343{
animalize4a7f44a2019-02-18 21:26:37 +0800344 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000345 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000346
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000347 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000348 state->lastindex = -1;
349
350 state->repeat = NULL;
351
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000352 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000353}
354
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300355static const void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200356getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300357 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600358 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000359{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000360 /* given a python object, return a data pointer, a length (in
361 characters), and a character size. return NULL if the object
362 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000363
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000364 /* Unicode objects do not support the buffer API. So, get the data
365 directly instead. */
366 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200367 if (PyUnicode_READY(string) == -1)
368 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200370 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300371 *p_isbytes = 0;
372 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000373 }
374
Victor Stinner0058b862011-09-29 03:27:47 +0200375 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300376 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200377 PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300378 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000379 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000380
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300381 *p_length = view->len;
382 *p_charsize = 1;
383 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000384
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300385 if (view->buf == NULL) {
386 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
387 PyBuffer_Release(view);
388 view->buf = NULL;
389 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000390 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300391 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000392}
393
394LOCAL(PyObject*)
395state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000396 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000397{
398 /* prepare state object */
399
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000400 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300401 int isbytes, charsize;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300402 const void* ptr;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000403
404 memset(state, 0, sizeof(SRE_STATE));
405
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300406 state->mark = PyMem_New(const void *, pattern->groups * 2);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300407 if (!state->mark) {
408 PyErr_NoMemory();
409 goto err;
410 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000411 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000412 state->lastindex = -1;
413
Benjamin Petersone48944b2012-03-07 14:50:25 -0600414 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300415 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000416 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600417 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000418
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300419 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600420 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200421 "cannot use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600422 goto err;
423 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300424 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600425 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200426 "cannot use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600427 goto err;
428 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000429
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000430 /* adjust boundaries */
431 if (start < 0)
432 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000433 else if (start > length)
434 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000435
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000436 if (end < 0)
437 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000438 else if (end > length)
439 end = length;
440
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300441 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000442 state->charsize = charsize;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200443 state->match_all = 0;
444 state->must_advance = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000445
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000446 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000447
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000448 state->start = (void*) ((char*) ptr + start * state->charsize);
449 state->end = (void*) ((char*) ptr + end * state->charsize);
450
451 Py_INCREF(string);
452 state->string = string;
453 state->pos = start;
454 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000455
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000456 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600457 err:
Ammar Askar06e3a272020-06-01 17:21:43 +0000458 /* We add an explicit cast here because MSVC has a bug when
459 compiling C code where it believes that `const void**` cannot be
460 safely casted to `void*`, see bpo-39943 for details. */
461 PyMem_Del((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300462 state->mark = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600463 if (state->buffer.buf)
464 PyBuffer_Release(&state->buffer);
465 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000466}
467
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000468LOCAL(void)
469state_fini(SRE_STATE* state)
470{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600471 if (state->buffer.buf)
472 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000473 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000474 data_stack_dealloc(state);
Ammar Askar06e3a272020-06-01 17:21:43 +0000475 /* See above PyMem_Del for why we explicitly cast here. */
476 PyMem_Del((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300477 state->mark = NULL;
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000478}
479
/* Character offset of `member` from the start of the subject string:
   the byte distance from state->beginning divided by state->charsize. */
#define STATE_OFFSET(state, member) (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
483
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000484LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300485getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300486 PyObject* string, Py_ssize_t start, Py_ssize_t end)
487{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300488 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300489 if (PyBytes_CheckExact(string) &&
490 start == 0 && end == PyBytes_GET_SIZE(string)) {
491 Py_INCREF(string);
492 return string;
493 }
494 return PyBytes_FromStringAndSize(
495 (const char *)ptr + start, end - start);
496 }
497 else {
498 return PyUnicode_Substring(string, start, end);
499 }
500}
501
502LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000503state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000504{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000505 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000506
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000507 index = (index - 1) * 2;
508
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000509 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000510 if (empty)
511 /* want empty string */
512 i = j = 0;
513 else {
Serhiy Storchaka228b12e2017-01-23 09:47:21 +0200514 Py_RETURN_NONE;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000515 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000516 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000517 i = STATE_OFFSET(state, state->mark[index]);
518 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000519 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000520
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300521 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000522}
523
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000524static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100525pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000526{
527 switch (status) {
528 case SRE_ERROR_RECURSION_LIMIT:
Yury Selivanovf488fb42015-07-03 01:04:23 -0400529 /* This error code seems to be unused. */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000530 PyErr_SetString(
Yury Selivanovf488fb42015-07-03 01:04:23 -0400531 PyExc_RecursionError,
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000532 "maximum recursion limit exceeded"
533 );
534 break;
535 case SRE_ERROR_MEMORY:
536 PyErr_NoMemory();
537 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000538 case SRE_ERROR_INTERRUPTED:
539 /* An exception has already been raised, so let it fly */
540 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000541 default:
542 /* other error codes indicate compiler/engine bugs */
543 PyErr_SetString(
544 PyExc_RuntimeError,
545 "internal error in regular expression engine"
546 );
547 }
548}
549
Guido van Rossumb700df92000-03-31 14:59:30 +0000550static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000551pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000552{
Raymond Hettinger027bb632004-05-31 03:09:25 +0000553 if (self->weakreflist != NULL)
554 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000555 Py_XDECREF(self->pattern);
556 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000557 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000558 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +0000559}
560
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300561LOCAL(Py_ssize_t)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200562sre_match(SRE_STATE* state, SRE_CODE* pattern)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300563{
564 if (state->charsize == 1)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200565 return sre_ucs1_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300566 if (state->charsize == 2)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200567 return sre_ucs2_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300568 assert(state->charsize == 4);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200569 return sre_ucs4_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300570}
571
572LOCAL(Py_ssize_t)
573sre_search(SRE_STATE* state, SRE_CODE* pattern)
574{
575 if (state->charsize == 1)
576 return sre_ucs1_search(state, pattern);
577 if (state->charsize == 2)
578 return sre_ucs2_search(state, pattern);
579 assert(state->charsize == 4);
580 return sre_ucs4_search(state, pattern);
581}
582
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300583/*[clinic input]
584_sre.SRE_Pattern.match
585
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200586 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300587 pos: Py_ssize_t = 0
588 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300589
590Matches zero or more characters at the beginning of the string.
591[clinic start generated code]*/
592
Larry Hastings16c51912014-01-07 11:53:01 -0800593static PyObject *
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300594_sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200595 Py_ssize_t pos, Py_ssize_t endpos)
596/*[clinic end generated code: output=ea2d838888510661 input=a2ba191647abebe5]*/
Larry Hastings16c51912014-01-07 11:53:01 -0800597{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000598 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100599 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300600 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000601
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300602 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000603 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000604
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000605 state.ptr = state.start;
606
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000607 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
608
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200609 status = sre_match(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000610
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000611 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300612 if (PyErr_Occurred()) {
613 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000614 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300615 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000616
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300617 match = pattern_new_match(self, &state, status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000618 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300619 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000620}
621
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300622/*[clinic input]
623_sre.SRE_Pattern.fullmatch
624
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200625 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300626 pos: Py_ssize_t = 0
627 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300628
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +0300629Matches against all of the string.
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300630[clinic start generated code]*/
631
632static PyObject *
633_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200634 Py_ssize_t pos, Py_ssize_t endpos)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +0300635/*[clinic end generated code: output=5833c47782a35f4a input=d9fb03a7625b5828]*/
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200636{
637 SRE_STATE state;
638 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300639 PyObject *match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200640
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300641 if (!state_init(&state, self, string, pos, endpos))
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200642 return NULL;
643
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200644 state.ptr = state.start;
645
646 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
647
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200648 state.match_all = 1;
649 status = sre_match(&state, PatternObject_GetCode(self));
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200650
651 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300652 if (PyErr_Occurred()) {
653 state_fini(&state);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200654 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300655 }
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200656
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300657 match = pattern_new_match(self, &state, status);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200658 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300659 return match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200660}
661
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300662/*[clinic input]
663_sre.SRE_Pattern.search
664
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200665 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300666 pos: Py_ssize_t = 0
667 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300668
669Scan through string looking for a match, and return a corresponding match object instance.
670
671Return None if no position in the string matches.
672[clinic start generated code]*/
673
674static PyObject *
675_sre_SRE_Pattern_search_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200676 Py_ssize_t pos, Py_ssize_t endpos)
677/*[clinic end generated code: output=25f302a644e951e8 input=4ae5cb7dc38fed1b]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000678{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000679 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100680 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300681 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000682
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300683 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000684 return NULL;
685
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000686 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
687
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300688 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000689
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000690 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
691
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300692 if (PyErr_Occurred()) {
693 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000694 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300695 }
Thomas Wouters89f507f2006-12-13 04:49:30 +0000696
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300697 match = pattern_new_match(self, &state, status);
698 state_fini(&state);
699 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000700}
701
702static PyObject*
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200703call(const char* module, const char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000704{
705 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000706 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000707 PyObject* func;
708 PyObject* result;
709
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000710 if (!args)
711 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000712 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000713 if (!name)
714 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000715 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000716 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000717 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000718 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000719 func = PyObject_GetAttrString(mod, function);
720 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000721 if (!func)
722 return NULL;
723 result = PyObject_CallObject(func, args);
724 Py_DECREF(func);
725 Py_DECREF(args);
726 return result;
727}
728
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300729/*[clinic input]
730_sre.SRE_Pattern.findall
731
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200732 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300733 pos: Py_ssize_t = 0
734 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300735
736Return a list of all non-overlapping matches of pattern in string.
737[clinic start generated code]*/
738
739static PyObject *
740_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200741 Py_ssize_t pos, Py_ssize_t endpos)
742/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000743{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000744 SRE_STATE state;
745 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100746 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000747 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000748
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300749 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000750 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000751
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000752 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000753 if (!list) {
754 state_fini(&state);
755 return NULL;
756 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000757
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000758 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000759
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000760 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000761
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000762 state_reset(&state);
763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000764 state.ptr = state.start;
765
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300766 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300767 if (PyErr_Occurred())
768 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000769
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000770 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000771 if (status == 0)
772 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000773 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000774 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000775 }
Tim Peters3d563502006-01-21 02:47:53 +0000776
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000777 /* don't bother to build a match object */
778 switch (self->groups) {
779 case 0:
780 b = STATE_OFFSET(&state, state.start);
781 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300782 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300783 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000784 if (!item)
785 goto error;
786 break;
787 case 1:
788 item = state_getslice(&state, 1, string, 1);
789 if (!item)
790 goto error;
791 break;
792 default:
793 item = PyTuple_New(self->groups);
794 if (!item)
795 goto error;
796 for (i = 0; i < self->groups; i++) {
797 PyObject* o = state_getslice(&state, i+1, string, 1);
798 if (!o) {
799 Py_DECREF(item);
800 goto error;
801 }
802 PyTuple_SET_ITEM(item, i, o);
803 }
804 break;
805 }
806
807 status = PyList_Append(list, item);
808 Py_DECREF(item);
809 if (status < 0)
810 goto error;
811
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200812 state.must_advance = (state.ptr == state.start);
813 state.start = state.ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000814 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000815
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000816 state_fini(&state);
817 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000818
819error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000820 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000821 state_fini(&state);
822 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000823
Guido van Rossumb700df92000-03-31 14:59:30 +0000824}
825
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300826/*[clinic input]
827_sre.SRE_Pattern.finditer
828
829 string: object
830 pos: Py_ssize_t = 0
831 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
832
833Return an iterator over all non-overlapping matches for the RE pattern in string.
834
835For each match, the iterator returns a match object.
836[clinic start generated code]*/
837
838static PyObject *
839_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyObject *string,
840 Py_ssize_t pos, Py_ssize_t endpos)
841/*[clinic end generated code: output=0bbb1a0aeb38bb14 input=612aab69e9fe08e4]*/
Fredrik Lundh703ce812001-10-24 22:16:30 +0000842{
843 PyObject* scanner;
844 PyObject* search;
845 PyObject* iterator;
846
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300847 scanner = pattern_scanner(self, string, pos, endpos);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000848 if (!scanner)
849 return NULL;
850
851 search = PyObject_GetAttrString(scanner, "search");
852 Py_DECREF(scanner);
853 if (!search)
854 return NULL;
855
856 iterator = PyCallIter_New(search, Py_None);
857 Py_DECREF(search);
858
859 return iterator;
860}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000861
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300862/*[clinic input]
863_sre.SRE_Pattern.scanner
864
865 string: object
866 pos: Py_ssize_t = 0
867 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
868
869[clinic start generated code]*/
870
871static PyObject *
872_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyObject *string,
873 Py_ssize_t pos, Py_ssize_t endpos)
874/*[clinic end generated code: output=54ea548aed33890b input=3aacdbde77a3a637]*/
875{
876 return pattern_scanner(self, string, pos, endpos);
877}
878
879/*[clinic input]
880_sre.SRE_Pattern.split
881
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200882 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300883 maxsplit: Py_ssize_t = 0
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300884
885Split string by the occurrences of pattern.
886[clinic start generated code]*/
887
888static PyObject *
889_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200890 Py_ssize_t maxsplit)
891/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000892{
893 SRE_STATE state;
894 PyObject* list;
895 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100896 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000897 Py_ssize_t n;
898 Py_ssize_t i;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300899 const void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000900
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200901 assert(self->codesize != 0);
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200902
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300903 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000904 return NULL;
905
906 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000907 if (!list) {
908 state_fini(&state);
909 return NULL;
910 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000911
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000912 n = 0;
913 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000914
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000915 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000916
917 state_reset(&state);
918
919 state.ptr = state.start;
920
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300921 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300922 if (PyErr_Occurred())
923 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000924
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000925 if (status <= 0) {
926 if (status == 0)
927 break;
928 pattern_error(status);
929 goto error;
930 }
Tim Peters3d563502006-01-21 02:47:53 +0000931
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000932 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300933 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000934 string, STATE_OFFSET(&state, last),
935 STATE_OFFSET(&state, state.start)
936 );
937 if (!item)
938 goto error;
939 status = PyList_Append(list, item);
940 Py_DECREF(item);
941 if (status < 0)
942 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000943
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000944 /* add groups (if any) */
945 for (i = 0; i < self->groups; i++) {
946 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000947 if (!item)
948 goto error;
949 status = PyList_Append(list, item);
950 Py_DECREF(item);
951 if (status < 0)
952 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000953 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000954
955 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +0200956 state.must_advance = (state.ptr == state.start);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000957 last = state.start = state.ptr;
958
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000959 }
960
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000961 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300962 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000963 string, STATE_OFFSET(&state, last), state.endpos
964 );
965 if (!item)
966 goto error;
967 status = PyList_Append(list, item);
968 Py_DECREF(item);
969 if (status < 0)
970 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000971
972 state_fini(&state);
973 return list;
974
975error:
976 Py_DECREF(list);
977 state_fini(&state);
978 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000979
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000980}
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000981
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000982static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000983pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000984 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000985{
986 SRE_STATE state;
987 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +0300988 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000989 PyObject* item;
990 PyObject* filter;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000991 PyObject* match;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300992 const void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100993 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000994 Py_ssize_t n;
995 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300996 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000997 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600998 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000999
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001000 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001001 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001002 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001003 Py_INCREF(filter);
1004 filter_is_callable = 1;
1005 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001006 /* if not callable, check if it's a literal string */
1007 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001008 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001009 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001010 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001011 if (charsize == 1)
1012 literal = memchr(ptr, '\\', n) == NULL;
1013 else
1014 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001015 } else {
1016 PyErr_Clear();
1017 literal = 0;
1018 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06001019 if (view.buf)
1020 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001021 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001022 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001023 Py_INCREF(filter);
1024 filter_is_callable = 0;
1025 } else {
1026 /* not a literal; hand it over to the template compiler */
1027 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00001028 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001029 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001030 );
1031 if (!filter)
1032 return NULL;
1033 filter_is_callable = PyCallable_Check(filter);
1034 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00001035 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001036
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001037 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001038 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001039 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001040 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001041
1042 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001043 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001044 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001045 state_fini(&state);
1046 return NULL;
1047 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001048
1049 n = i = 0;
1050
1051 while (!count || n < count) {
1052
1053 state_reset(&state);
1054
1055 state.ptr = state.start;
1056
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001057 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001058 if (PyErr_Occurred())
1059 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001060
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001061 if (status <= 0) {
1062 if (status == 0)
1063 break;
1064 pattern_error(status);
1065 goto error;
1066 }
Tim Peters3d563502006-01-21 02:47:53 +00001067
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001068 b = STATE_OFFSET(&state, state.start);
1069 e = STATE_OFFSET(&state, state.ptr);
1070
1071 if (i < b) {
1072 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001073 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001074 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001075 if (!item)
1076 goto error;
1077 status = PyList_Append(list, item);
1078 Py_DECREF(item);
1079 if (status < 0)
1080 goto error;
1081
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001082 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001083
1084 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001085 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001086 match = pattern_new_match(self, &state, 1);
1087 if (!match)
1088 goto error;
Petr Viktorinffd97532020-02-11 17:46:57 +01001089 item = PyObject_CallOneArg(filter, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001090 Py_DECREF(match);
1091 if (!item)
1092 goto error;
1093 } else {
1094 /* filter is literal string */
1095 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001096 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001097 }
1098
1099 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001100 if (item != Py_None) {
1101 status = PyList_Append(list, item);
1102 Py_DECREF(item);
1103 if (status < 0)
1104 goto error;
1105 }
Tim Peters3d563502006-01-21 02:47:53 +00001106
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001107 i = e;
1108 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001109 state.must_advance = (state.ptr == state.start);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001110 state.start = state.ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001111 }
1112
1113 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001114 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001115 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001116 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001117 if (!item)
1118 goto error;
1119 status = PyList_Append(list, item);
1120 Py_DECREF(item);
1121 if (status < 0)
1122 goto error;
1123 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001124
1125 state_fini(&state);
1126
Guido van Rossum4e173842001-12-07 04:25:10 +00001127 Py_DECREF(filter);
1128
Fredrik Lundhdac58492001-10-21 21:48:30 +00001129 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001130 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001131 if (!joiner) {
1132 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001133 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001134 }
1135 if (PyList_GET_SIZE(list) == 0) {
1136 Py_DECREF(list);
1137 item = joiner;
1138 }
1139 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001140 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001141 item = _PyBytes_Join(joiner, list);
1142 else
1143 item = PyUnicode_Join(joiner, list);
1144 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001145 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001146 if (!item)
1147 return NULL;
1148 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001149
1150 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001151 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001152
1153 return item;
1154
1155error:
1156 Py_DECREF(list);
1157 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001158 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001159 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001160
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001161}
1162
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001163/*[clinic input]
1164_sre.SRE_Pattern.sub
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001165
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001166 repl: object
1167 string: object
1168 count: Py_ssize_t = 0
1169
1170Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1171[clinic start generated code]*/
1172
1173static PyObject *
1174_sre_SRE_Pattern_sub_impl(PatternObject *self, PyObject *repl,
1175 PyObject *string, Py_ssize_t count)
1176/*[clinic end generated code: output=1dbf2ec3479cba00 input=c53d70be0b3caf86]*/
1177{
1178 return pattern_subx(self, repl, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001179}
1180
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001181/*[clinic input]
1182_sre.SRE_Pattern.subn
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001183
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001184 repl: object
1185 string: object
1186 count: Py_ssize_t = 0
1187
1188Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1189[clinic start generated code]*/
1190
1191static PyObject *
1192_sre_SRE_Pattern_subn_impl(PatternObject *self, PyObject *repl,
1193 PyObject *string, Py_ssize_t count)
1194/*[clinic end generated code: output=0d9522cd529e9728 input=e7342d7ce6083577]*/
1195{
1196 return pattern_subx(self, repl, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001197}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001198
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001199/*[clinic input]
1200_sre.SRE_Pattern.__copy__
1201
1202[clinic start generated code]*/
1203
1204static PyObject *
1205_sre_SRE_Pattern___copy___impl(PatternObject *self)
1206/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001207{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001208 Py_INCREF(self);
1209 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001210}
1211
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001212/*[clinic input]
1213_sre.SRE_Pattern.__deepcopy__
1214
1215 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001216 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001217
1218[clinic start generated code]*/
1219
1220static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001221_sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1222/*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001223{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001224 Py_INCREF(self);
1225 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001226}
1227
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001228static PyObject *
1229pattern_repr(PatternObject *obj)
1230{
1231 static const struct {
1232 const char *name;
1233 int value;
1234 } flag_names[] = {
1235 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1236 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1237 {"re.LOCALE", SRE_FLAG_LOCALE},
1238 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1239 {"re.DOTALL", SRE_FLAG_DOTALL},
1240 {"re.UNICODE", SRE_FLAG_UNICODE},
1241 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1242 {"re.DEBUG", SRE_FLAG_DEBUG},
1243 {"re.ASCII", SRE_FLAG_ASCII},
1244 };
1245 PyObject *result = NULL;
1246 PyObject *flag_items;
Victor Stinner706768c2014-08-16 01:03:39 +02001247 size_t i;
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001248 int flags = obj->flags;
1249
1250 /* Omit re.UNICODE for valid string patterns. */
1251 if (obj->isbytes == 0 &&
1252 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1253 SRE_FLAG_UNICODE)
1254 flags &= ~SRE_FLAG_UNICODE;
1255
1256 flag_items = PyList_New(0);
1257 if (!flag_items)
1258 return NULL;
1259
1260 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1261 if (flags & flag_names[i].value) {
1262 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1263 if (!item)
1264 goto done;
1265
1266 if (PyList_Append(flag_items, item) < 0) {
1267 Py_DECREF(item);
1268 goto done;
1269 }
1270 Py_DECREF(item);
1271 flags &= ~flag_names[i].value;
1272 }
1273 }
1274 if (flags) {
1275 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1276 if (!item)
1277 goto done;
1278
1279 if (PyList_Append(flag_items, item) < 0) {
1280 Py_DECREF(item);
1281 goto done;
1282 }
1283 Py_DECREF(item);
1284 }
1285
1286 if (PyList_Size(flag_items) > 0) {
1287 PyObject *flags_result;
1288 PyObject *sep = PyUnicode_FromString("|");
1289 if (!sep)
1290 goto done;
1291 flags_result = PyUnicode_Join(sep, flag_items);
1292 Py_DECREF(sep);
1293 if (!flags_result)
1294 goto done;
1295 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1296 obj->pattern, flags_result);
1297 Py_DECREF(flags_result);
1298 }
1299 else {
1300 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1301 }
1302
1303done:
1304 Py_DECREF(flag_items);
1305 return result;
1306}
1307
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001308PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001309
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001310/* PatternObject's 'groupindex' method. */
1311static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02001312pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001313{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001314 if (self->groupindex == NULL)
1315 return PyDict_New();
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001316 return PyDictProxy_New(self->groupindex);
1317}
1318
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001319static int _validate(PatternObject *self); /* Forward */
1320
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001321/*[clinic input]
1322_sre.compile
1323
1324 pattern: object
1325 flags: int
1326 code: object(subclass_of='&PyList_Type')
1327 groups: Py_ssize_t
Victor Stinner726a57d2016-11-22 23:04:39 +01001328 groupindex: object(subclass_of='&PyDict_Type')
1329 indexgroup: object(subclass_of='&PyTuple_Type')
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001330
1331[clinic start generated code]*/
1332
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001333static PyObject *
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +03001334_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001335 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1336 PyObject *indexgroup)
Victor Stinner726a57d2016-11-22 23:04:39 +01001337/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001338{
1339 /* "compile" pattern descriptor to pattern object */
1340
1341 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001342 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001343
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001344 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001345 /* coverity[ampersand_in_size] */
Victor Stinner92055202020-04-08 00:38:15 +02001346 self = PyObject_NewVar(PatternObject, &Pattern_Type, n);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001347 if (!self)
1348 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001349 self->weakreflist = NULL;
1350 self->pattern = NULL;
1351 self->groupindex = NULL;
1352 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001353
1354 self->codesize = n;
1355
1356 for (i = 0; i < n; i++) {
1357 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001358 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001359 self->code[i] = (SRE_CODE) value;
1360 if ((unsigned long) self->code[i] != value) {
1361 PyErr_SetString(PyExc_OverflowError,
1362 "regular expression code size limit exceeded");
1363 break;
1364 }
1365 }
1366
1367 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001368 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001369 return NULL;
1370 }
1371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001373 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 else {
1376 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001377 int charsize;
1378 Py_buffer view;
1379 view.buf = NULL;
1380 if (!getstring(pattern, &p_length, &self->isbytes,
1381 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 Py_DECREF(self);
1383 return NULL;
1384 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001385 if (view.buf)
1386 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001388
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001389 Py_INCREF(pattern);
1390 self->pattern = pattern;
1391
1392 self->flags = flags;
1393
1394 self->groups = groups;
1395
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001396 if (PyDict_GET_SIZE(groupindex) > 0) {
1397 Py_INCREF(groupindex);
1398 self->groupindex = groupindex;
1399 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1400 Py_INCREF(indexgroup);
1401 self->indexgroup = indexgroup;
1402 }
1403 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001404
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001405 if (!_validate(self)) {
1406 Py_DECREF(self);
1407 return NULL;
1408 }
1409
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001410 return (PyObject*) self;
1411}
1412
Guido van Rossumb700df92000-03-31 14:59:30 +00001413/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001414/* Code validation */
1415
1416/* To learn more about this code, have a look at the _compile() function in
1417 Lib/sre_compile.py. The validation functions below check the code array
1418 for conformance with the code patterns generated there.
1419
1420 The nice thing about the generated code is that it is position-independent:
1421 all jumps are relative jumps forward. Also, jumps don't cross each other:
1422 the target of a later jump is always earlier than the target of an earlier
1423 jump. IOW, this is okay:
1424
1425 J---------J-------T--------T
1426 \ \_____/ /
1427 \______________________/
1428
1429 but this is not:
1430
1431 J---------J-------T--------T
1432 \_________\_____/ /
1433 \____________/
1434
Serhiy Storchakaefa5a392013-10-27 08:04:58 +02001435 It also helps that SRE_CODE is always an unsigned type.
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001436*/
1437
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesized
   printf() argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: trace the source line and return 0 from the enclosing
   function */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on locals of the enclosing function: the cursor
   `code`, the limit `end`, and the destination variable `op`, `arg` or
   `skip`.  Each of them FAILs when the cursor has already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and step over the skip word.  FAILs when skip minus
   `adj` exceeds the number of words left from the skip word up to `end`.
   `adj` is parenthesized so that an expression argument is subtracted as a
   whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001478
1479static int
1480_validate_charset(SRE_CODE *code, SRE_CODE *end)
1481{
1482 /* Some variables are manipulated by the macros above */
1483 SRE_CODE op;
1484 SRE_CODE arg;
1485 SRE_CODE offset;
1486 int i;
1487
1488 while (code < end) {
1489 GET_OP;
1490 switch (op) {
1491
1492 case SRE_OP_NEGATE:
1493 break;
1494
1495 case SRE_OP_LITERAL:
1496 GET_ARG;
1497 break;
1498
1499 case SRE_OP_RANGE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001500 case SRE_OP_RANGE_UNI_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001501 GET_ARG;
1502 GET_ARG;
1503 break;
1504
1505 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001506 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Benjamin Petersonca470632016-09-06 13:47:26 -07001507 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001508 FAIL;
1509 code += offset;
1510 break;
1511
1512 case SRE_OP_BIGCHARSET:
1513 GET_ARG; /* Number of blocks */
1514 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001515 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001516 FAIL;
1517 /* Make sure that each byte points to a valid block */
1518 for (i = 0; i < 256; i++) {
1519 if (((unsigned char *)code)[i] >= arg)
1520 FAIL;
1521 }
1522 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001523 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Benjamin Petersonca470632016-09-06 13:47:26 -07001524 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001525 FAIL;
1526 code += offset;
1527 break;
1528
1529 case SRE_OP_CATEGORY:
1530 GET_ARG;
1531 switch (arg) {
1532 case SRE_CATEGORY_DIGIT:
1533 case SRE_CATEGORY_NOT_DIGIT:
1534 case SRE_CATEGORY_SPACE:
1535 case SRE_CATEGORY_NOT_SPACE:
1536 case SRE_CATEGORY_WORD:
1537 case SRE_CATEGORY_NOT_WORD:
1538 case SRE_CATEGORY_LINEBREAK:
1539 case SRE_CATEGORY_NOT_LINEBREAK:
1540 case SRE_CATEGORY_LOC_WORD:
1541 case SRE_CATEGORY_LOC_NOT_WORD:
1542 case SRE_CATEGORY_UNI_DIGIT:
1543 case SRE_CATEGORY_UNI_NOT_DIGIT:
1544 case SRE_CATEGORY_UNI_SPACE:
1545 case SRE_CATEGORY_UNI_NOT_SPACE:
1546 case SRE_CATEGORY_UNI_WORD:
1547 case SRE_CATEGORY_UNI_NOT_WORD:
1548 case SRE_CATEGORY_UNI_LINEBREAK:
1549 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1550 break;
1551 default:
1552 FAIL;
1553 }
1554 break;
1555
1556 default:
1557 FAIL;
1558
1559 }
1560 }
1561
1562 return 1;
1563}
1564
1565static int
1566_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1567{
1568 /* Some variables are manipulated by the macros above */
1569 SRE_CODE op;
1570 SRE_CODE arg;
1571 SRE_CODE skip;
1572
1573 VTRACE(("code=%p, end=%p\n", code, end));
1574
1575 if (code > end)
1576 FAIL;
1577
1578 while (code < end) {
1579 GET_OP;
1580 switch (op) {
1581
1582 case SRE_OP_MARK:
1583 /* We don't check whether marks are properly nested; the
1584 sre_match() code is robust even if they don't, and the worst
1585 you can get is nonsensical match results. */
1586 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001587 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001588 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1589 FAIL;
1590 }
1591 break;
1592
1593 case SRE_OP_LITERAL:
1594 case SRE_OP_NOT_LITERAL:
1595 case SRE_OP_LITERAL_IGNORE:
1596 case SRE_OP_NOT_LITERAL_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001597 case SRE_OP_LITERAL_UNI_IGNORE:
1598 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001599 case SRE_OP_LITERAL_LOC_IGNORE:
1600 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001601 GET_ARG;
1602 /* The arg is just a character, nothing to check */
1603 break;
1604
1605 case SRE_OP_SUCCESS:
1606 case SRE_OP_FAILURE:
1607 /* Nothing to check; these normally end the matching process */
1608 break;
1609
1610 case SRE_OP_AT:
1611 GET_ARG;
1612 switch (arg) {
1613 case SRE_AT_BEGINNING:
1614 case SRE_AT_BEGINNING_STRING:
1615 case SRE_AT_BEGINNING_LINE:
1616 case SRE_AT_END:
1617 case SRE_AT_END_LINE:
1618 case SRE_AT_END_STRING:
1619 case SRE_AT_BOUNDARY:
1620 case SRE_AT_NON_BOUNDARY:
1621 case SRE_AT_LOC_BOUNDARY:
1622 case SRE_AT_LOC_NON_BOUNDARY:
1623 case SRE_AT_UNI_BOUNDARY:
1624 case SRE_AT_UNI_NON_BOUNDARY:
1625 break;
1626 default:
1627 FAIL;
1628 }
1629 break;
1630
1631 case SRE_OP_ANY:
1632 case SRE_OP_ANY_ALL:
1633 /* These have no operands */
1634 break;
1635
1636 case SRE_OP_IN:
1637 case SRE_OP_IN_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001638 case SRE_OP_IN_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001639 case SRE_OP_IN_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001640 GET_SKIP;
1641 /* Stop 1 before the end; we check the FAILURE below */
1642 if (!_validate_charset(code, code+skip-2))
1643 FAIL;
1644 if (code[skip-2] != SRE_OP_FAILURE)
1645 FAIL;
1646 code += skip-1;
1647 break;
1648
1649 case SRE_OP_INFO:
1650 {
1651 /* A minimal info field is
1652 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1653 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1654 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001655 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001656 SRE_CODE *newcode;
1657 GET_SKIP;
1658 newcode = code+skip-1;
1659 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001660 GET_ARG;
1661 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001662 /* Check that only valid flags are present */
1663 if ((flags & ~(SRE_INFO_PREFIX |
1664 SRE_INFO_LITERAL |
1665 SRE_INFO_CHARSET)) != 0)
1666 FAIL;
1667 /* PREFIX and CHARSET are mutually exclusive */
1668 if ((flags & SRE_INFO_PREFIX) &&
1669 (flags & SRE_INFO_CHARSET))
1670 FAIL;
1671 /* LITERAL implies PREFIX */
1672 if ((flags & SRE_INFO_LITERAL) &&
1673 !(flags & SRE_INFO_PREFIX))
1674 FAIL;
1675 /* Validate the prefix */
1676 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001677 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001678 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001679 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001680 /* Here comes the prefix string */
Benjamin Petersonca470632016-09-06 13:47:26 -07001681 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001682 FAIL;
1683 code += prefix_len;
1684 /* And here comes the overlap table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001685 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001686 FAIL;
1687 /* Each overlap value should be < prefix_len */
1688 for (i = 0; i < prefix_len; i++) {
1689 if (code[i] >= prefix_len)
1690 FAIL;
1691 }
1692 code += prefix_len;
1693 }
1694 /* Validate the charset */
1695 if (flags & SRE_INFO_CHARSET) {
1696 if (!_validate_charset(code, newcode-1))
1697 FAIL;
1698 if (newcode[-1] != SRE_OP_FAILURE)
1699 FAIL;
1700 code = newcode;
1701 }
1702 else if (code != newcode) {
1703 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1704 FAIL;
1705 }
1706 }
1707 break;
1708
1709 case SRE_OP_BRANCH:
1710 {
1711 SRE_CODE *target = NULL;
1712 for (;;) {
1713 GET_SKIP;
1714 if (skip == 0)
1715 break;
1716 /* Stop 2 before the end; we check the JUMP below */
1717 if (!_validate_inner(code, code+skip-3, groups))
1718 FAIL;
1719 code += skip-3;
1720 /* Check that it ends with a JUMP, and that each JUMP
1721 has the same target */
1722 GET_OP;
1723 if (op != SRE_OP_JUMP)
1724 FAIL;
1725 GET_SKIP;
1726 if (target == NULL)
1727 target = code+skip-1;
1728 else if (code+skip-1 != target)
1729 FAIL;
1730 }
1731 }
1732 break;
1733
1734 case SRE_OP_REPEAT_ONE:
1735 case SRE_OP_MIN_REPEAT_ONE:
1736 {
1737 SRE_CODE min, max;
1738 GET_SKIP;
1739 GET_ARG; min = arg;
1740 GET_ARG; max = arg;
1741 if (min > max)
1742 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001743 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001744 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001745 if (!_validate_inner(code, code+skip-4, groups))
1746 FAIL;
1747 code += skip-4;
1748 GET_OP;
1749 if (op != SRE_OP_SUCCESS)
1750 FAIL;
1751 }
1752 break;
1753
1754 case SRE_OP_REPEAT:
1755 {
1756 SRE_CODE min, max;
1757 GET_SKIP;
1758 GET_ARG; min = arg;
1759 GET_ARG; max = arg;
1760 if (min > max)
1761 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001762 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001763 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001764 if (!_validate_inner(code, code+skip-3, groups))
1765 FAIL;
1766 code += skip-3;
1767 GET_OP;
1768 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1769 FAIL;
1770 }
1771 break;
1772
1773 case SRE_OP_GROUPREF:
1774 case SRE_OP_GROUPREF_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001775 case SRE_OP_GROUPREF_UNI_IGNORE:
1776 case SRE_OP_GROUPREF_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001777 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001778 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001779 FAIL;
1780 break;
1781
1782 case SRE_OP_GROUPREF_EXISTS:
1783 /* The regex syntax for this is: '(?(group)then|else)', where
1784 'group' is either an integer group number or a group name,
1785 'then' and 'else' are sub-regexes, and 'else' is optional. */
1786 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001787 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001788 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001789 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001790 code--; /* The skip is relative to the first arg! */
1791 /* There are two possibilities here: if there is both a 'then'
1792 part and an 'else' part, the generated code looks like:
1793
1794 GROUPREF_EXISTS
1795 <group>
1796 <skipyes>
1797 ...then part...
1798 JUMP
1799 <skipno>
1800 (<skipyes> jumps here)
1801 ...else part...
1802 (<skipno> jumps here)
1803
1804 If there is only a 'then' part, it looks like:
1805
1806 GROUPREF_EXISTS
1807 <group>
1808 <skip>
1809 ...then part...
1810 (<skip> jumps here)
1811
1812 There is no direct way to decide which it is, and we don't want
1813 to allow arbitrary jumps anywhere in the code; so we just look
1814 for a JUMP opcode preceding our skip target.
1815 */
Benjamin Petersonca470632016-09-06 13:47:26 -07001816 if (skip >= 3 && skip-3 < (uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001817 code[skip-3] == SRE_OP_JUMP)
1818 {
1819 VTRACE(("both then and else parts present\n"));
1820 if (!_validate_inner(code+1, code+skip-3, groups))
1821 FAIL;
1822 code += skip-2; /* Position after JUMP, at <skipno> */
1823 GET_SKIP;
1824 if (!_validate_inner(code, code+skip-1, groups))
1825 FAIL;
1826 code += skip-1;
1827 }
1828 else {
1829 VTRACE(("only a then part present\n"));
1830 if (!_validate_inner(code+1, code+skip-1, groups))
1831 FAIL;
1832 code += skip-1;
1833 }
1834 break;
1835
1836 case SRE_OP_ASSERT:
1837 case SRE_OP_ASSERT_NOT:
1838 GET_SKIP;
1839 GET_ARG; /* 0 for lookahead, width for lookbehind */
1840 code--; /* Back up over arg to simplify math below */
1841 if (arg & 0x80000000)
1842 FAIL; /* Width too large */
1843 /* Stop 1 before the end; we check the SUCCESS below */
1844 if (!_validate_inner(code+1, code+skip-2, groups))
1845 FAIL;
1846 code += skip-2;
1847 GET_OP;
1848 if (op != SRE_OP_SUCCESS)
1849 FAIL;
1850 break;
1851
1852 default:
1853 FAIL;
1854
1855 }
1856 }
1857
1858 VTRACE(("okay\n"));
1859 return 1;
1860}
1861
1862static int
1863_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1864{
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001865 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1866 code >= end || end[-1] != SRE_OP_SUCCESS)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001867 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001868 return _validate_inner(code, end-1, groups);
1869}
1870
1871static int
1872_validate(PatternObject *self)
1873{
1874 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1875 {
1876 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1877 return 0;
1878 }
1879 else
1880 VTRACE(("Success!\n"));
1881 return 1;
1882}
1883
1884/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001885/* match methods */
1886
1887static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001888match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001889{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001890 Py_XDECREF(self->regs);
1891 Py_XDECREF(self->string);
1892 Py_DECREF(self->pattern);
1893 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001894}
1895
1896static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001897match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001898{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001899 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001900 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001901 Py_buffer view;
1902 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001903 const void* ptr;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001904 Py_ssize_t i, j;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001905
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001906 assert(0 <= index && index < self->groups);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001907 index *= 2;
1908
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001909 if (self->string == Py_None || self->mark[index] < 0) {
1910 /* return default value if the string or group is undefined */
1911 Py_INCREF(def);
1912 return def;
1913 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001914
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001915 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001916 if (ptr == NULL)
1917 return NULL;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001918
1919 i = self->mark[index];
1920 j = self->mark[index+1];
1921 i = Py_MIN(i, length);
1922 j = Py_MIN(j, length);
1923 result = getslice(isbytes, ptr, self->string, i, j);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001924 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001925 PyBuffer_Release(&view);
1926 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001927}
1928
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001929static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001930match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001931{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001932 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001933
Guido van Rossumddefaf32007-01-14 03:31:43 +00001934 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001935 /* Default value */
1936 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00001937
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001938 if (PyIndex_Check(index)) {
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001939 i = PyNumber_AsSsize_t(index, NULL);
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001940 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001941 else {
1942 i = -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001943
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001944 if (self->pattern->groupindex) {
1945 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
1946 if (index && PyLong_Check(index)) {
1947 i = PyLong_AsSsize_t(index);
1948 }
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001949 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001950 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001951 if (i < 0 || i >= self->groups) {
1952 /* raise IndexError if we were given a bad group number */
1953 if (!PyErr_Occurred()) {
1954 PyErr_SetString(PyExc_IndexError, "no such group");
1955 }
1956 return -1;
1957 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001958
1959 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001960}
1961
1962static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001963match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001964{
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001965 Py_ssize_t i = match_getindex(self, index);
1966
1967 if (i < 0) {
1968 return NULL;
1969 }
1970
1971 return match_getslice_by_index(self, i, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001972}
1973
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001974/*[clinic input]
1975_sre.SRE_Match.expand
1976
1977 template: object
1978
1979Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
1980[clinic start generated code]*/
1981
1982static PyObject *
1983_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
1984/*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001985{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001986 /* delegate to Python code */
1987 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00001988 SRE_PY_MODULE, "_expand",
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001989 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001990 );
1991}
1992
1993static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001994match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001995{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001996 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001997 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001998
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001999 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002000
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002001 switch (size) {
2002 case 0:
Victor Stinner37834132020-10-27 17:12:53 +01002003 result = match_getslice(self, _PyLong_GetZero(), Py_None);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002004 break;
2005 case 1:
2006 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2007 break;
2008 default:
2009 /* fetch multiple items */
2010 result = PyTuple_New(size);
2011 if (!result)
2012 return NULL;
2013 for (i = 0; i < size; i++) {
2014 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002015 self, PyTuple_GET_ITEM(args, i), Py_None
2016 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002017 if (!item) {
2018 Py_DECREF(result);
2019 return NULL;
2020 }
2021 PyTuple_SET_ITEM(result, i, item);
2022 }
2023 break;
2024 }
2025 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002026}
2027
Eric V. Smith605bdae2016-09-11 08:55:43 -04002028static PyObject*
2029match_getitem(MatchObject* self, PyObject* name)
2030{
2031 return match_getslice(self, name, Py_None);
2032}
2033
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002034/*[clinic input]
2035_sre.SRE_Match.groups
2036
2037 default: object = None
2038 Is used for groups that did not participate in the match.
2039
2040Return a tuple containing all the subgroups of the match, from 1.
2041[clinic start generated code]*/
2042
2043static PyObject *
2044_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2045/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002046{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002047 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002048 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002049
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002050 result = PyTuple_New(self->groups-1);
2051 if (!result)
2052 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 for (index = 1; index < self->groups; index++) {
2055 PyObject* item;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002056 item = match_getslice_by_index(self, index, default_value);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 if (!item) {
2058 Py_DECREF(result);
2059 return NULL;
2060 }
2061 PyTuple_SET_ITEM(result, index-1, item);
2062 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002064 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002065}
2066
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002067/*[clinic input]
2068_sre.SRE_Match.groupdict
2069
2070 default: object = None
2071 Is used for groups that did not participate in the match.
2072
2073Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2074[clinic start generated code]*/
2075
2076static PyObject *
2077_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2078/*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002079{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002080 PyObject *result;
2081 PyObject *key;
2082 PyObject *value;
2083 Py_ssize_t pos = 0;
2084 Py_hash_t hash;
Guido van Rossumb700df92000-03-31 14:59:30 +00002085
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 result = PyDict_New();
2087 if (!result || !self->pattern->groupindex)
2088 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002089
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002090 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002091 int status;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002092 Py_INCREF(key);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002093 value = match_getslice(self, key, default_value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002094 if (!value) {
2095 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002096 goto failed;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002097 }
2098 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002099 Py_DECREF(value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002100 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002101 if (status < 0)
2102 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002103 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002104
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002105 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002106
2107failed:
Fredrik Lundh770617b2001-01-14 15:06:11 +00002108 Py_DECREF(result);
2109 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002110}
2111
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002112/*[clinic input]
2113_sre.SRE_Match.start -> Py_ssize_t
2114
2115 group: object(c_default="NULL") = 0
2116 /
2117
2118Return index of the start of the substring matched by group.
2119[clinic start generated code]*/
2120
2121static Py_ssize_t
2122_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2123/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002124{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002125 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002126
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002127 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002128 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002129 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002130
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002131 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002132 return self->mark[index*2];
Guido van Rossumb700df92000-03-31 14:59:30 +00002133}
2134
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002135/*[clinic input]
2136_sre.SRE_Match.end -> Py_ssize_t
2137
2138 group: object(c_default="NULL") = 0
2139 /
2140
2141Return index of the end of the substring matched by group.
2142[clinic start generated code]*/
2143
2144static Py_ssize_t
2145_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2146/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002147{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002148 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002149
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002150 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002151 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002152 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002153
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002154 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002155 return self->mark[index*2+1];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002156}
2157
2158LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002159_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002160{
2161 PyObject* pair;
2162 PyObject* item;
2163
2164 pair = PyTuple_New(2);
2165 if (!pair)
2166 return NULL;
2167
Christian Heimes217cfd12007-12-02 14:31:20 +00002168 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002169 if (!item)
2170 goto error;
2171 PyTuple_SET_ITEM(pair, 0, item);
2172
Christian Heimes217cfd12007-12-02 14:31:20 +00002173 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002174 if (!item)
2175 goto error;
2176 PyTuple_SET_ITEM(pair, 1, item);
2177
2178 return pair;
2179
2180 error:
2181 Py_DECREF(pair);
2182 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002183}
2184
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002185/*[clinic input]
2186_sre.SRE_Match.span
2187
2188 group: object(c_default="NULL") = 0
2189 /
2190
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002191For match object m, return the 2-tuple (m.start(group), m.end(group)).
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002192[clinic start generated code]*/
2193
2194static PyObject *
2195_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002196/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002197{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002198 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002199
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002200 if (index < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002201 return NULL;
2202 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002203
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002204 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002205 return _pair(self->mark[index*2], self->mark[index*2+1]);
2206}
2207
2208static PyObject*
2209match_regs(MatchObject* self)
2210{
2211 PyObject* regs;
2212 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002213 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002214
2215 regs = PyTuple_New(self->groups);
2216 if (!regs)
2217 return NULL;
2218
2219 for (index = 0; index < self->groups; index++) {
2220 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2221 if (!item) {
2222 Py_DECREF(regs);
2223 return NULL;
2224 }
2225 PyTuple_SET_ITEM(regs, index, item);
2226 }
2227
2228 Py_INCREF(regs);
2229 self->regs = regs;
2230
2231 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002232}
2233
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002234/*[clinic input]
2235_sre.SRE_Match.__copy__
2236
2237[clinic start generated code]*/
2238
2239static PyObject *
2240_sre_SRE_Match___copy___impl(MatchObject *self)
2241/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002242{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002243 Py_INCREF(self);
2244 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002245}
2246
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002247/*[clinic input]
2248_sre.SRE_Match.__deepcopy__
2249
2250 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002251 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002252
2253[clinic start generated code]*/
2254
2255static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002256_sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2257/*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002258{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002259 Py_INCREF(self);
2260 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002261}
2262
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002263PyDoc_STRVAR(match_doc,
2264"The result of re.match() and re.search().\n\
2265Match objects always have a boolean value of True.");
2266
2267PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002268"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002269 Return subgroup(s) of the match by indices or names.\n\
2270 For 0 returns the entire match.");
2271
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002272static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002273match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
Guido van Rossumb700df92000-03-31 14:59:30 +00002274{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002275 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002276 return PyLong_FromSsize_t(self->lastindex);
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002277 Py_RETURN_NONE;
Guido van Rossumb700df92000-03-31 14:59:30 +00002278}
2279
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002280static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002281match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002282{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002283 if (self->pattern->indexgroup &&
2284 self->lastindex >= 0 &&
2285 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2286 {
2287 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2288 self->lastindex);
2289 Py_INCREF(result);
2290 return result;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002291 }
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002292 Py_RETURN_NONE;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002293}
2294
2295static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002296match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002297{
2298 if (self->regs) {
2299 Py_INCREF(self->regs);
2300 return self->regs;
2301 } else
2302 return match_regs(self);
2303}
2304
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002305static PyObject *
2306match_repr(MatchObject *self)
2307{
2308 PyObject *result;
2309 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2310 if (group0 == NULL)
2311 return NULL;
2312 result = PyUnicode_FromFormat(
sth8b91eda2019-03-10 11:29:14 +01002313 "<%s object; span=(%zd, %zd), match=%.50R>",
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002314 Py_TYPE(self)->tp_name,
2315 self->mark[0], self->mark[1], group0);
2316 Py_DECREF(group0);
2317 return result;
2318}
2319
2320
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002321static PyObject*
Victor Stinnerf5587782013-11-15 23:21:11 +01002322pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002323{
2324 /* create match object (from state object) */
2325
2326 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002327 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002328 char* base;
2329 int n;
2330
2331 if (status > 0) {
2332
2333 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002334 /* coverity[ampersand_in_size] */
Victor Stinner92055202020-04-08 00:38:15 +02002335 match = PyObject_NewVar(MatchObject, &Match_Type,
2336 2*(pattern->groups+1));
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002337 if (!match)
2338 return NULL;
2339
2340 Py_INCREF(pattern);
2341 match->pattern = pattern;
2342
2343 Py_INCREF(state->string);
2344 match->string = state->string;
2345
2346 match->regs = NULL;
2347 match->groups = pattern->groups+1;
2348
2349 /* fill in group slices */
2350
2351 base = (char*) state->beginning;
2352 n = state->charsize;
2353
2354 match->mark[0] = ((char*) state->start - base) / n;
2355 match->mark[1] = ((char*) state->ptr - base) / n;
2356
2357 for (i = j = 0; i < pattern->groups; i++, j+=2)
2358 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2359 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2360 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2361 } else
2362 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2363
2364 match->pos = state->pos;
2365 match->endpos = state->endpos;
2366
2367 match->lastindex = state->lastindex;
2368
2369 return (PyObject*) match;
2370
2371 } else if (status == 0) {
2372
2373 /* no match */
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002374 Py_RETURN_NONE;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002375
2376 }
2377
2378 /* internal error */
2379 pattern_error(status);
2380 return NULL;
2381}
2382
2383
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002384/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002385/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002386
2387static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002388scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002389{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002390 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002391 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002392 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002393}
2394
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002395/*[clinic input]
2396_sre.SRE_Scanner.match
2397
2398[clinic start generated code]*/
2399
2400static PyObject *
2401_sre_SRE_Scanner_match_impl(ScannerObject *self)
2402/*[clinic end generated code: output=936b30c63d4b81eb input=881a0154f8c13d9a]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002403{
2404 SRE_STATE* state = &self->state;
2405 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002406 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002407
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002408 if (state->start == NULL)
2409 Py_RETURN_NONE;
2410
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002411 state_reset(state);
2412
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002413 state->ptr = state->start;
2414
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002415 status = sre_match(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002416 if (PyErr_Occurred())
2417 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002418
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002419 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002420 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002421
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002422 if (status == 0)
2423 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002424 else {
2425 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002426 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002427 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002428
2429 return match;
2430}
2431
2432
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002433/*[clinic input]
2434_sre.SRE_Scanner.search
2435
2436[clinic start generated code]*/
2437
2438static PyObject *
2439_sre_SRE_Scanner_search_impl(ScannerObject *self)
2440/*[clinic end generated code: output=7dc211986088f025 input=161223ee92ef9270]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002441{
2442 SRE_STATE* state = &self->state;
2443 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002444 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002445
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002446 if (state->start == NULL)
2447 Py_RETURN_NONE;
2448
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002449 state_reset(state);
2450
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002451 state->ptr = state->start;
2452
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002453 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002454 if (PyErr_Occurred())
2455 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002456
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002457 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002458 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002459
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002460 if (status == 0)
2461 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002462 else {
2463 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002464 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002465 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002466
2467 return match;
2468}
2469
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002470static PyObject *
2471pattern_scanner(PatternObject *self, PyObject *string, Py_ssize_t pos, Py_ssize_t endpos)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002472{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002473 ScannerObject* scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002474
2475 /* create scanner object */
Victor Stinner92055202020-04-08 00:38:15 +02002476 scanner = PyObject_New(ScannerObject, &Scanner_Type);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002477 if (!scanner)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002478 return NULL;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002479 scanner->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002480
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002481 /* create search state object */
2482 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2483 Py_DECREF(scanner);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002484 return NULL;
2485 }
2486
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002487 Py_INCREF(self);
2488 scanner->pattern = (PyObject*) self;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002489
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002490 return (PyObject*) scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002491}
2492
Victor Stinnerb44fb122016-11-21 16:35:08 +01002493static Py_hash_t
2494pattern_hash(PatternObject *self)
2495{
2496 Py_hash_t hash, hash2;
2497
2498 hash = PyObject_Hash(self->pattern);
2499 if (hash == -1) {
2500 return -1;
2501 }
2502
2503 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2504 hash ^= hash2;
2505
2506 hash ^= self->flags;
2507 hash ^= self->isbytes;
2508 hash ^= self->codesize;
2509
2510 if (hash == -1) {
2511 hash = -2;
2512 }
2513 return hash;
2514}
2515
2516static PyObject*
2517pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2518{
2519 PatternObject *left, *right;
2520 int cmp;
2521
2522 if (op != Py_EQ && op != Py_NE) {
2523 Py_RETURN_NOTIMPLEMENTED;
2524 }
2525
Dong-hee Na1b55b652020-02-17 19:09:15 +09002526 if (!Py_IS_TYPE(lefto, &Pattern_Type) || !Py_IS_TYPE(righto, &Pattern_Type)) {
Victor Stinnerb44fb122016-11-21 16:35:08 +01002527 Py_RETURN_NOTIMPLEMENTED;
2528 }
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01002529
2530 if (lefto == righto) {
2531 /* a pattern is equal to itself */
2532 return PyBool_FromLong(op == Py_EQ);
2533 }
2534
Victor Stinnerb44fb122016-11-21 16:35:08 +01002535 left = (PatternObject *)lefto;
2536 right = (PatternObject *)righto;
2537
2538 cmp = (left->flags == right->flags
2539 && left->isbytes == right->isbytes
Victor Stinnere670b2d2016-11-22 15:23:00 +01002540 && left->codesize == right->codesize);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002541 if (cmp) {
2542 /* Compare the code and the pattern because the same pattern can
2543 produce different codes depending on the locale used to compile the
2544 pattern when the re.LOCALE flag is used. Don't compare groups,
2545 indexgroup nor groupindex: they are derivated from the pattern. */
2546 cmp = (memcmp(left->code, right->code,
2547 sizeof(left->code[0]) * left->codesize) == 0);
2548 }
2549 if (cmp) {
2550 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2551 Py_EQ);
2552 if (cmp < 0) {
2553 return NULL;
2554 }
2555 }
2556 if (op == Py_NE) {
2557 cmp = !cmp;
2558 }
2559 return PyBool_FromLong(cmp);
2560}
2561
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002562#include "clinic/_sre.c.h"
2563
2564static PyMethodDef pattern_methods[] = {
2565 _SRE_SRE_PATTERN_MATCH_METHODDEF
2566 _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
2567 _SRE_SRE_PATTERN_SEARCH_METHODDEF
2568 _SRE_SRE_PATTERN_SUB_METHODDEF
2569 _SRE_SRE_PATTERN_SUBN_METHODDEF
2570 _SRE_SRE_PATTERN_FINDALL_METHODDEF
2571 _SRE_SRE_PATTERN_SPLIT_METHODDEF
2572 _SRE_SRE_PATTERN_FINDITER_METHODDEF
2573 _SRE_SRE_PATTERN_SCANNER_METHODDEF
2574 _SRE_SRE_PATTERN___COPY___METHODDEF
2575 _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002576 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2577 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002578 {NULL, NULL}
2579};
2580
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002581static PyGetSetDef pattern_getset[] = {
2582 {"groupindex", (getter)pattern_groupindex, (setter)NULL,
2583 "A dictionary mapping group names to group numbers."},
2584 {NULL} /* Sentinel */
2585};
2586
2587#define PAT_OFF(x) offsetof(PatternObject, x)
2588static PyMemberDef pattern_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002589 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY,
2590 "The pattern string from which the RE object was compiled."},
2591 {"flags", T_INT, PAT_OFF(flags), READONLY,
2592 "The regex matching flags."},
2593 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY,
2594 "The number of capturing groups in the pattern."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002595 {NULL} /* Sentinel */
2596};
2597
2598static PyTypeObject Pattern_Type = {
2599 PyVarObject_HEAD_INIT(NULL, 0)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002600 "re.Pattern",
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002601 sizeof(PatternObject), sizeof(SRE_CODE),
2602 (destructor)pattern_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002603 0, /* tp_vectorcall_offset */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002604 0, /* tp_getattr */
2605 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002606 0, /* tp_as_async */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002607 (reprfunc)pattern_repr, /* tp_repr */
2608 0, /* tp_as_number */
2609 0, /* tp_as_sequence */
2610 0, /* tp_as_mapping */
Victor Stinnerb44fb122016-11-21 16:35:08 +01002611 (hashfunc)pattern_hash, /* tp_hash */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002612 0, /* tp_call */
2613 0, /* tp_str */
2614 0, /* tp_getattro */
2615 0, /* tp_setattro */
2616 0, /* tp_as_buffer */
2617 Py_TPFLAGS_DEFAULT, /* tp_flags */
2618 pattern_doc, /* tp_doc */
2619 0, /* tp_traverse */
2620 0, /* tp_clear */
Victor Stinnerb44fb122016-11-21 16:35:08 +01002621 pattern_richcompare, /* tp_richcompare */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002622 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2623 0, /* tp_iter */
2624 0, /* tp_iternext */
2625 pattern_methods, /* tp_methods */
2626 pattern_members, /* tp_members */
2627 pattern_getset, /* tp_getset */
2628};
2629
Eric V. Smith605bdae2016-09-11 08:55:43 -04002630/* Match objects do not support length or assignment, but do support
2631 __getitem__. */
2632static PyMappingMethods match_as_mapping = {
2633 NULL,
2634 (binaryfunc)match_getitem,
2635 NULL
2636};
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002637
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002638static PyMethodDef match_methods[] = {
2639 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2640 _SRE_SRE_MATCH_START_METHODDEF
2641 _SRE_SRE_MATCH_END_METHODDEF
2642 _SRE_SRE_MATCH_SPAN_METHODDEF
2643 _SRE_SRE_MATCH_GROUPS_METHODDEF
2644 _SRE_SRE_MATCH_GROUPDICT_METHODDEF
2645 _SRE_SRE_MATCH_EXPAND_METHODDEF
2646 _SRE_SRE_MATCH___COPY___METHODDEF
2647 _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002648 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2649 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002650 {NULL, NULL}
2651};
2652
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002653static PyGetSetDef match_getset[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002654 {"lastindex", (getter)match_lastindex_get, (setter)NULL,
2655 "The integer index of the last matched capturing group."},
2656 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
2657 "The name of the last matched capturing group."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002658 {"regs", (getter)match_regs_get, (setter)NULL},
2659 {NULL}
2660};
2661
2662#define MATCH_OFF(x) offsetof(MatchObject, x)
2663static PyMemberDef match_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002664 {"string", T_OBJECT, MATCH_OFF(string), READONLY,
2665 "The string passed to match() or search()."},
2666 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY,
2667 "The regular expression object."},
2668 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY,
2669 "The index into the string at which the RE engine started looking for a match."},
2670 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY,
2671 "The index into the string beyond which the RE engine will not go."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002672 {NULL}
2673};
2674
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2677
2678static PyTypeObject Match_Type = {
2679 PyVarObject_HEAD_INIT(NULL,0)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002680 "re.Match",
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002681 sizeof(MatchObject), sizeof(Py_ssize_t),
2682 (destructor)match_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002683 0, /* tp_vectorcall_offset */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002684 0, /* tp_getattr */
2685 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002686 0, /* tp_as_async */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002687 (reprfunc)match_repr, /* tp_repr */
2688 0, /* tp_as_number */
2689 0, /* tp_as_sequence */
Eric V. Smith605bdae2016-09-11 08:55:43 -04002690 &match_as_mapping, /* tp_as_mapping */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002691 0, /* tp_hash */
2692 0, /* tp_call */
2693 0, /* tp_str */
2694 0, /* tp_getattro */
2695 0, /* tp_setattro */
2696 0, /* tp_as_buffer */
2697 Py_TPFLAGS_DEFAULT, /* tp_flags */
2698 match_doc, /* tp_doc */
2699 0, /* tp_traverse */
2700 0, /* tp_clear */
2701 0, /* tp_richcompare */
2702 0, /* tp_weaklistoffset */
2703 0, /* tp_iter */
2704 0, /* tp_iternext */
2705 match_methods, /* tp_methods */
2706 match_members, /* tp_members */
2707 match_getset, /* tp_getset */
2708};
2709
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002710static PyMethodDef scanner_methods[] = {
2711 _SRE_SRE_SCANNER_MATCH_METHODDEF
2712 _SRE_SRE_SCANNER_SEARCH_METHODDEF
2713 {NULL, NULL}
2714};
2715
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002716#define SCAN_OFF(x) offsetof(ScannerObject, x)
2717static PyMemberDef scanner_members[] = {
2718 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
2719 {NULL} /* Sentinel */
2720};
2721
2722static PyTypeObject Scanner_Type = {
2723 PyVarObject_HEAD_INIT(NULL, 0)
2724 "_" SRE_MODULE ".SRE_Scanner",
2725 sizeof(ScannerObject), 0,
2726 (destructor)scanner_dealloc,/* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002727 0, /* tp_vectorcall_offset */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002728 0, /* tp_getattr */
2729 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002730 0, /* tp_as_async */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002731 0, /* tp_repr */
2732 0, /* tp_as_number */
2733 0, /* tp_as_sequence */
2734 0, /* tp_as_mapping */
2735 0, /* tp_hash */
2736 0, /* tp_call */
2737 0, /* tp_str */
2738 0, /* tp_getattro */
2739 0, /* tp_setattro */
2740 0, /* tp_as_buffer */
2741 Py_TPFLAGS_DEFAULT, /* tp_flags */
2742 0, /* tp_doc */
2743 0, /* tp_traverse */
2744 0, /* tp_clear */
2745 0, /* tp_richcompare */
2746 0, /* tp_weaklistoffset */
2747 0, /* tp_iter */
2748 0, /* tp_iternext */
2749 scanner_methods, /* tp_methods */
2750 scanner_members, /* tp_members */
2751 0, /* tp_getset */
2752};
2753
Guido van Rossumb700df92000-03-31 14:59:30 +00002754static PyMethodDef _functions[] = {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002755 _SRE_COMPILE_METHODDEF
2756 _SRE_GETCODESIZE_METHODDEF
Serhiy Storchaka6d336a02017-05-09 23:37:14 +03002757 _SRE_ASCII_ISCASED_METHODDEF
2758 _SRE_UNICODE_ISCASED_METHODDEF
Serhiy Storchaka7186cc22017-05-05 10:42:46 +03002759 _SRE_ASCII_TOLOWER_METHODDEF
2760 _SRE_UNICODE_TOLOWER_METHODDEF
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002761 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002762};
2763
Martin v. Löwis1a214512008-06-11 05:26:20 +00002764static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002765 PyModuleDef_HEAD_INIT,
2766 "_" SRE_MODULE,
2767 NULL,
2768 -1,
2769 _functions,
2770 NULL,
2771 NULL,
2772 NULL,
2773 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00002774};
2775
2776PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002777{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002778 PyObject* m;
2779 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002780 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002781
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00002782 /* Patch object types */
2783 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
2784 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00002785 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002786
Martin v. Löwis1a214512008-06-11 05:26:20 +00002787 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00002788 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002789 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002790 d = PyModule_GetDict(m);
2791
Christian Heimes217cfd12007-12-02 14:31:20 +00002792 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002793 if (x) {
2794 PyDict_SetItemString(d, "MAGIC", x);
2795 Py_DECREF(x);
2796 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002797
Christian Heimes217cfd12007-12-02 14:31:20 +00002798 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00002799 if (x) {
2800 PyDict_SetItemString(d, "CODESIZE", x);
2801 Py_DECREF(x);
2802 }
2803
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02002804 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
2805 if (x) {
2806 PyDict_SetItemString(d, "MAXREPEAT", x);
2807 Py_DECREF(x);
2808 }
2809
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03002810 x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
2811 if (x) {
2812 PyDict_SetItemString(d, "MAXGROUPS", x);
2813 Py_DECREF(x);
2814 }
2815
Neal Norwitzfe537132007-08-26 03:55:15 +00002816 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002817 if (x) {
2818 PyDict_SetItemString(d, "copyright", x);
2819 Py_DECREF(x);
2820 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00002821 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00002822}
2823
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002824/* vim:ts=4:sw=4:et
2825*/