blob: 70bd8baa01e204e4e20fb52fdd0caaeb7d6a704c [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl   created (based on existing template matcher code)
 * 2000-03-06 fl   first alpha, sort of
 * 2000-08-01 fl   fixes for 1.6b1
 * 2000-08-07 fl   use PyOS_CheckStack() if available
 * 2000-09-20 fl   added expand method
 * 2001-03-20 fl   lots of fixes for 2.1b2
 * 2001-04-15 fl   export copyright as Python attribute, not global
 * 2001-04-28 fl   added __copy__ methods (work in progress)
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl   added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl   added sub/subn primitive
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl   fixed empty sub/subn return type
 * 2003-04-18 mvl  fully support 4-byte codes
 * 2003-10-17 gn   implemented non recursive scheme
 * 2013-02-04 mrab added fullmatch primitive
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
37
/* Version and copyright banner string for this engine. */
static const char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Victor Stinner4a21e572020-04-15 02:35:41 +020044#include "structmember.h" // PyMemberDef
Guido van Rossumb700df92000-03-31 14:59:30 +000045
46#include "sre.h"
47
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030048#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
49
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000050#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000051
/* Name of this module, minus the leading underscore.  A definition
   supplied before this point takes precedence over the default. */
#ifndef SRE_MODULE
#define SRE_MODULE "sre"
#endif

/* NOTE(review): presumably the Python-level module that fronts this
   engine -- confirm against the places SRE_PY_MODULE is used. */
#define SRE_PY_MODULE "re"
58
/* Tracing is compiled out by default; defining VERBOSE enables it. */
#undef VERBOSE

/* -------------------------------------------------------------------- */

/* LOCAL(type) introduces a file-local inline function returning `type`. */
#ifdef _MSC_VER
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif

/* error codes */
#define SRE_ERROR_ILLEGAL -1          /* illegal opcode */
#define SRE_ERROR_STATE -2            /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3  /* runaway recursion */
#define SRE_ERROR_MEMORY -9           /* out of memory */
#define SRE_ERROR_INTERRUPTED -10     /* signal handler raised exception */

/* TRACE((fmt, ...)) forwards its parenthesised argument list to printf
   when VERBOSE is defined, and expands to nothing otherwise. */
#ifdef VERBOSE
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
85
/* -------------------------------------------------------------------- */
/* search engine state */

/* ASCII character predicates.  Each macro compares `ch` against an upper
   bound first and only then consults the corresponding Py_IS* test.
   Note that `ch` is evaluated more than once. */
#define SRE_IS_DIGIT(ch) ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch) ((ch) <= ' ' && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_IS_WORD(ch) ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
/* Lower-case `ch` when it is below 128; any other value is returned
   unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
102
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */

/* Values with any bit above the low eight set are never alphanumeric;
   everything else is classified by the C library's isalnum(). */
#define SRE_LOC_IS_ALNUM(ch) \
    (!((ch) & ~255) ? isalnum((ch)) : 0)

/* A word character is an alphanumeric (per the above) or an underscore. */
#define SRE_LOC_IS_WORD(ch) \
    (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
108
/* Lower-case `ch` with the C library's tolower() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)tolower((int)ch);
}
113
/* Upper-case `ch` with the C library's toupper() when it is below 256;
   larger values are returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)toupper((int)ch);
}
118
/* unicode-specific character predicates */

/* Each predicate forwards to a Py_UNICODE_* classification macro.  Note
   that the "digit" test maps onto Py_UNICODE_ISDECIMAL. */
#define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM(ch)
/* word character: alphanumeric or underscore */
#define SRE_UNI_IS_WORD(ch)      (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000126
/* Return Py_UNICODE_TOLOWER(ch) converted to unsigned int. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);

    return lowered;
}
131
/* Return Py_UNICODE_TOUPPER(ch) converted to unsigned int. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);

    return raised;
}
136
Guido van Rossumb700df92000-03-31 14:59:30 +0000137LOCAL(int)
138sre_category(SRE_CODE category, unsigned int ch)
139{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000140 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000141
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000142 case SRE_CATEGORY_DIGIT:
143 return SRE_IS_DIGIT(ch);
144 case SRE_CATEGORY_NOT_DIGIT:
145 return !SRE_IS_DIGIT(ch);
146 case SRE_CATEGORY_SPACE:
147 return SRE_IS_SPACE(ch);
148 case SRE_CATEGORY_NOT_SPACE:
149 return !SRE_IS_SPACE(ch);
150 case SRE_CATEGORY_WORD:
151 return SRE_IS_WORD(ch);
152 case SRE_CATEGORY_NOT_WORD:
153 return !SRE_IS_WORD(ch);
154 case SRE_CATEGORY_LINEBREAK:
155 return SRE_IS_LINEBREAK(ch);
156 case SRE_CATEGORY_NOT_LINEBREAK:
157 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000158
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000159 case SRE_CATEGORY_LOC_WORD:
160 return SRE_LOC_IS_WORD(ch);
161 case SRE_CATEGORY_LOC_NOT_WORD:
162 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000163
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000164 case SRE_CATEGORY_UNI_DIGIT:
165 return SRE_UNI_IS_DIGIT(ch);
166 case SRE_CATEGORY_UNI_NOT_DIGIT:
167 return !SRE_UNI_IS_DIGIT(ch);
168 case SRE_CATEGORY_UNI_SPACE:
169 return SRE_UNI_IS_SPACE(ch);
170 case SRE_CATEGORY_UNI_NOT_SPACE:
171 return !SRE_UNI_IS_SPACE(ch);
172 case SRE_CATEGORY_UNI_WORD:
173 return SRE_UNI_IS_WORD(ch);
174 case SRE_CATEGORY_UNI_NOT_WORD:
175 return !SRE_UNI_IS_WORD(ch);
176 case SRE_CATEGORY_UNI_LINEBREAK:
177 return SRE_UNI_IS_LINEBREAK(ch);
178 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
179 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000180 }
181 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000182}
183
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300184LOCAL(int)
185char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
186{
187 return ch == pattern
188 || (SRE_CODE) sre_lower_locale(ch) == pattern
189 || (SRE_CODE) sre_upper_locale(ch) == pattern;
190}
191
192
Guido van Rossumb700df92000-03-31 14:59:30 +0000193/* helpers */
194
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000195static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000196data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000197{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000198 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000200 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000201 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000202 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000203}
204
205static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000206data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000207{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000208 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000209 minsize = state->data_stack_base+size;
210 cursize = state->data_stack_size;
211 if (cursize < minsize) {
212 void* stack;
213 cursize = minsize+minsize/4+1024;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200214 TRACE(("allocate/grow stack %zd\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000216 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000217 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000218 return SRE_ERROR_MEMORY;
219 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000220 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000221 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000222 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000223 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000224}
225
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000226/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000227
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300228#define SRE_CHAR Py_UCS1
229#define SIZEOF_SRE_CHAR 1
230#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300231#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000232
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300233/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000234
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300235#define SRE_CHAR Py_UCS2
236#define SIZEOF_SRE_CHAR 2
237#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300238#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000239
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300240/* generate 32-bit unicode version */
241
242#define SRE_CHAR Py_UCS4
243#define SIZEOF_SRE_CHAR 4
244#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300245#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000246
247/* -------------------------------------------------------------------- */
248/* factories and destructors */
249
250/* see sre.h for object declarations */
Victor Stinnerf5587782013-11-15 23:21:11 +0100251static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, Py_ssize_t);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300252static PyObject *pattern_scanner(PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
Guido van Rossumb700df92000-03-31 14:59:30 +0000253
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300254
255/*[clinic input]
256module _sre
257class _sre.SRE_Pattern "PatternObject *" "&Pattern_Type"
258class _sre.SRE_Match "MatchObject *" "&Match_Type"
259class _sre.SRE_Scanner "ScannerObject *" "&Scanner_Type"
260[clinic start generated code]*/
261/*[clinic end generated code: output=da39a3ee5e6b4b0d input=b0230ec19a0deac8]*/
262
Larry Hastings2d0a69a2015-05-03 14:49:19 -0700263static PyTypeObject Pattern_Type;
264static PyTypeObject Match_Type;
265static PyTypeObject Scanner_Type;
266
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300267/*[clinic input]
268_sre.getcodesize -> int
269[clinic start generated code]*/
270
271static int
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +0300272_sre_getcodesize_impl(PyObject *module)
273/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000274{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300275 return sizeof(SRE_CODE);
Guido van Rossumb700df92000-03-31 14:59:30 +0000276}
277
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300278/*[clinic input]
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300279_sre.ascii_iscased -> bool
280
281 character: int
282 /
283
284[clinic start generated code]*/
285
286static int
287_sre_ascii_iscased_impl(PyObject *module, int character)
288/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
289{
290 unsigned int ch = (unsigned int)character;
Sergey Fedoseev7f0d59f2018-09-12 17:49:09 +0500291 return ch < 128 && Py_ISALPHA(ch);
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300292}
293
294/*[clinic input]
295_sre.unicode_iscased -> bool
296
297 character: int
298 /
299
300[clinic start generated code]*/
301
302static int
303_sre_unicode_iscased_impl(PyObject *module, int character)
304/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
305{
306 unsigned int ch = (unsigned int)character;
307 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
308}
309
310/*[clinic input]
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300311_sre.ascii_tolower -> int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300312
313 character: int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300314 /
315
316[clinic start generated code]*/
317
318static int
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300319_sre_ascii_tolower_impl(PyObject *module, int character)
320/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000321{
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300322 return sre_lower_ascii(character);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000323}
324
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300325/*[clinic input]
326_sre.unicode_tolower -> int
327
328 character: int
329 /
330
331[clinic start generated code]*/
332
333static int
334_sre_unicode_tolower_impl(PyObject *module, int character)
335/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
336{
337 return sre_lower_unicode(character);
338}
339
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000340LOCAL(void)
341state_reset(SRE_STATE* state)
342{
animalize4a7f44a2019-02-18 21:26:37 +0800343 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000344 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000345
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000346 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000347 state->lastindex = -1;
348
349 state->repeat = NULL;
350
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000351 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000352}
353
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300354static const void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300356 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600357 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000358{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000359 /* given a python object, return a data pointer, a length (in
360 characters), and a character size. return NULL if the object
361 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000362
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000363 /* Unicode objects do not support the buffer API. So, get the data
364 directly instead. */
365 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 if (PyUnicode_READY(string) == -1)
367 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200369 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300370 *p_isbytes = 0;
371 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000372 }
373
Victor Stinner0058b862011-09-29 03:27:47 +0200374 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300375 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200376 PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300377 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000378 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000379
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300380 *p_length = view->len;
381 *p_charsize = 1;
382 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000383
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300384 if (view->buf == NULL) {
385 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
386 PyBuffer_Release(view);
387 view->buf = NULL;
388 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000389 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300390 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000391}
392
393LOCAL(PyObject*)
394state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000395 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000396{
397 /* prepare state object */
398
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000399 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300400 int isbytes, charsize;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300401 const void* ptr;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000402
403 memset(state, 0, sizeof(SRE_STATE));
404
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300405 state->mark = PyMem_New(const void *, pattern->groups * 2);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300406 if (!state->mark) {
407 PyErr_NoMemory();
408 goto err;
409 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000410 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000411 state->lastindex = -1;
412
Benjamin Petersone48944b2012-03-07 14:50:25 -0600413 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300414 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000415 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600416 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000417
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300418 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600419 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200420 "cannot use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600421 goto err;
422 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300423 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600424 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200425 "cannot use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600426 goto err;
427 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000428
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000429 /* adjust boundaries */
430 if (start < 0)
431 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000432 else if (start > length)
433 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000434
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000435 if (end < 0)
436 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000437 else if (end > length)
438 end = length;
439
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300440 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000441 state->charsize = charsize;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200442 state->match_all = 0;
443 state->must_advance = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000444
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000445 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000446
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000447 state->start = (void*) ((char*) ptr + start * state->charsize);
448 state->end = (void*) ((char*) ptr + end * state->charsize);
449
450 Py_INCREF(string);
451 state->string = string;
452 state->pos = start;
453 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000454
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000455 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600456 err:
Ammar Askar06e3a272020-06-01 17:21:43 +0000457 /* We add an explicit cast here because MSVC has a bug when
458 compiling C code where it believes that `const void**` cannot be
459 safely casted to `void*`, see bpo-39943 for details. */
460 PyMem_Del((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300461 state->mark = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600462 if (state->buffer.buf)
463 PyBuffer_Release(&state->buffer);
464 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000465}
466
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000467LOCAL(void)
468state_fini(SRE_STATE* state)
469{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600470 if (state->buffer.buf)
471 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000472 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000473 data_stack_dealloc(state);
Ammar Askar06e3a272020-06-01 17:21:43 +0000474 /* See above PyMem_Del for why we explicitly cast here. */
475 PyMem_Del((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300476 state->mark = NULL;
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000477}
478
/* calculate offset from start of string: the byte distance between
   `member` and the state's `beginning`, divided by the character size */
#define STATE_OFFSET(state, member) \
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
482
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000483LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300484getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300485 PyObject* string, Py_ssize_t start, Py_ssize_t end)
486{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300487 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300488 if (PyBytes_CheckExact(string) &&
489 start == 0 && end == PyBytes_GET_SIZE(string)) {
490 Py_INCREF(string);
491 return string;
492 }
493 return PyBytes_FromStringAndSize(
494 (const char *)ptr + start, end - start);
495 }
496 else {
497 return PyUnicode_Substring(string, start, end);
498 }
499}
500
501LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000502state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000503{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000504 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000505
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000506 index = (index - 1) * 2;
507
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000508 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000509 if (empty)
510 /* want empty string */
511 i = j = 0;
512 else {
Serhiy Storchaka228b12e2017-01-23 09:47:21 +0200513 Py_RETURN_NONE;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000514 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000515 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000516 i = STATE_OFFSET(state, state->mark[index]);
517 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000518 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000519
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300520 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000521}
522
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000523static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100524pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000525{
526 switch (status) {
527 case SRE_ERROR_RECURSION_LIMIT:
Yury Selivanovf488fb42015-07-03 01:04:23 -0400528 /* This error code seems to be unused. */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000529 PyErr_SetString(
Yury Selivanovf488fb42015-07-03 01:04:23 -0400530 PyExc_RecursionError,
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000531 "maximum recursion limit exceeded"
532 );
533 break;
534 case SRE_ERROR_MEMORY:
535 PyErr_NoMemory();
536 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000537 case SRE_ERROR_INTERRUPTED:
538 /* An exception has already been raised, so let it fly */
539 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000540 default:
541 /* other error codes indicate compiler/engine bugs */
542 PyErr_SetString(
543 PyExc_RuntimeError,
544 "internal error in regular expression engine"
545 );
546 }
547}
548
Guido van Rossumb700df92000-03-31 14:59:30 +0000549static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000550pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000551{
Raymond Hettinger027bb632004-05-31 03:09:25 +0000552 if (self->weakreflist != NULL)
553 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000554 Py_XDECREF(self->pattern);
555 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000556 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000557 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +0000558}
559
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300560LOCAL(Py_ssize_t)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200561sre_match(SRE_STATE* state, SRE_CODE* pattern)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300562{
563 if (state->charsize == 1)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200564 return sre_ucs1_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300565 if (state->charsize == 2)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200566 return sre_ucs2_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300567 assert(state->charsize == 4);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200568 return sre_ucs4_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300569}
570
571LOCAL(Py_ssize_t)
572sre_search(SRE_STATE* state, SRE_CODE* pattern)
573{
574 if (state->charsize == 1)
575 return sre_ucs1_search(state, pattern);
576 if (state->charsize == 2)
577 return sre_ucs2_search(state, pattern);
578 assert(state->charsize == 4);
579 return sre_ucs4_search(state, pattern);
580}
581
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300582/*[clinic input]
583_sre.SRE_Pattern.match
584
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200585 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300586 pos: Py_ssize_t = 0
587 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300588
589Matches zero or more characters at the beginning of the string.
590[clinic start generated code]*/
591
Larry Hastings16c51912014-01-07 11:53:01 -0800592static PyObject *
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300593_sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200594 Py_ssize_t pos, Py_ssize_t endpos)
595/*[clinic end generated code: output=ea2d838888510661 input=a2ba191647abebe5]*/
Larry Hastings16c51912014-01-07 11:53:01 -0800596{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000597 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100598 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300599 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000600
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300601 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000602 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000603
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000604 state.ptr = state.start;
605
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000606 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
607
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200608 status = sre_match(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000609
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000610 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300611 if (PyErr_Occurred()) {
612 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000613 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300614 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000615
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300616 match = pattern_new_match(self, &state, status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000617 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300618 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000619}
620
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300621/*[clinic input]
622_sre.SRE_Pattern.fullmatch
623
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200624 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300625 pos: Py_ssize_t = 0
626 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300627
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +0300628Matches against all of the string.
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300629[clinic start generated code]*/
630
631static PyObject *
632_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200633 Py_ssize_t pos, Py_ssize_t endpos)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +0300634/*[clinic end generated code: output=5833c47782a35f4a input=d9fb03a7625b5828]*/
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200635{
636 SRE_STATE state;
637 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300638 PyObject *match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200639
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300640 if (!state_init(&state, self, string, pos, endpos))
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200641 return NULL;
642
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200643 state.ptr = state.start;
644
645 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
646
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200647 state.match_all = 1;
648 status = sre_match(&state, PatternObject_GetCode(self));
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200649
650 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300651 if (PyErr_Occurred()) {
652 state_fini(&state);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200653 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300654 }
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200655
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300656 match = pattern_new_match(self, &state, status);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200657 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300658 return match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200659}
660
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300661/*[clinic input]
662_sre.SRE_Pattern.search
663
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200664 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300665 pos: Py_ssize_t = 0
666 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300667
668Scan through string looking for a match, and return a corresponding match object instance.
669
670Return None if no position in the string matches.
671[clinic start generated code]*/
672
673static PyObject *
674_sre_SRE_Pattern_search_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200675 Py_ssize_t pos, Py_ssize_t endpos)
676/*[clinic end generated code: output=25f302a644e951e8 input=4ae5cb7dc38fed1b]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000677{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000678 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100679 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300680 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000681
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300682 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000683 return NULL;
684
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000685 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
686
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300687 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000688
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000689 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
690
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300691 if (PyErr_Occurred()) {
692 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000693 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300694 }
Thomas Wouters89f507f2006-12-13 04:49:30 +0000695
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300696 match = pattern_new_match(self, &state, status);
697 state_fini(&state);
698 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000699}
700
701static PyObject*
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200702call(const char* module, const char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000703{
704 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000705 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000706 PyObject* func;
707 PyObject* result;
708
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000709 if (!args)
710 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000711 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000712 if (!name)
713 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000714 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000715 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000716 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000717 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000718 func = PyObject_GetAttrString(mod, function);
719 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000720 if (!func)
721 return NULL;
722 result = PyObject_CallObject(func, args);
723 Py_DECREF(func);
724 Py_DECREF(args);
725 return result;
726}
727
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300728/*[clinic input]
729_sre.SRE_Pattern.findall
730
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200731 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300732 pos: Py_ssize_t = 0
733 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300734
735Return a list of all non-overlapping matches of pattern in string.
736[clinic start generated code]*/
737
738static PyObject *
739_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200740 Py_ssize_t pos, Py_ssize_t endpos)
741/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000742{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000743 SRE_STATE state;
744 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100745 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000746 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000747
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300748 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000749 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000750
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000751 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000752 if (!list) {
753 state_fini(&state);
754 return NULL;
755 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000756
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000757 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000758
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000759 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000760
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000761 state_reset(&state);
762
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000763 state.ptr = state.start;
764
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300765 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300766 if (PyErr_Occurred())
767 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000768
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000769 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000770 if (status == 0)
771 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000772 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000773 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000774 }
Tim Peters3d563502006-01-21 02:47:53 +0000775
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000776 /* don't bother to build a match object */
777 switch (self->groups) {
778 case 0:
779 b = STATE_OFFSET(&state, state.start);
780 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300781 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300782 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000783 if (!item)
784 goto error;
785 break;
786 case 1:
787 item = state_getslice(&state, 1, string, 1);
788 if (!item)
789 goto error;
790 break;
791 default:
792 item = PyTuple_New(self->groups);
793 if (!item)
794 goto error;
795 for (i = 0; i < self->groups; i++) {
796 PyObject* o = state_getslice(&state, i+1, string, 1);
797 if (!o) {
798 Py_DECREF(item);
799 goto error;
800 }
801 PyTuple_SET_ITEM(item, i, o);
802 }
803 break;
804 }
805
806 status = PyList_Append(list, item);
807 Py_DECREF(item);
808 if (status < 0)
809 goto error;
810
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200811 state.must_advance = (state.ptr == state.start);
812 state.start = state.ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000814
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000815 state_fini(&state);
816 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000817
818error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000819 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000820 state_fini(&state);
821 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000822
Guido van Rossumb700df92000-03-31 14:59:30 +0000823}
824
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300825/*[clinic input]
826_sre.SRE_Pattern.finditer
827
828 string: object
829 pos: Py_ssize_t = 0
830 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
831
832Return an iterator over all non-overlapping matches for the RE pattern in string.
833
834For each match, the iterator returns a match object.
835[clinic start generated code]*/
836
837static PyObject *
838_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyObject *string,
839 Py_ssize_t pos, Py_ssize_t endpos)
840/*[clinic end generated code: output=0bbb1a0aeb38bb14 input=612aab69e9fe08e4]*/
Fredrik Lundh703ce812001-10-24 22:16:30 +0000841{
842 PyObject* scanner;
843 PyObject* search;
844 PyObject* iterator;
845
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300846 scanner = pattern_scanner(self, string, pos, endpos);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000847 if (!scanner)
848 return NULL;
849
850 search = PyObject_GetAttrString(scanner, "search");
851 Py_DECREF(scanner);
852 if (!search)
853 return NULL;
854
855 iterator = PyCallIter_New(search, Py_None);
856 Py_DECREF(search);
857
858 return iterator;
859}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000860
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300861/*[clinic input]
862_sre.SRE_Pattern.scanner
863
864 string: object
865 pos: Py_ssize_t = 0
866 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
867
868[clinic start generated code]*/
869
870static PyObject *
871_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyObject *string,
872 Py_ssize_t pos, Py_ssize_t endpos)
873/*[clinic end generated code: output=54ea548aed33890b input=3aacdbde77a3a637]*/
874{
875 return pattern_scanner(self, string, pos, endpos);
876}
877
878/*[clinic input]
879_sre.SRE_Pattern.split
880
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200881 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300882 maxsplit: Py_ssize_t = 0
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300883
884Split string by the occurrences of pattern.
885[clinic start generated code]*/
886
887static PyObject *
888_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200889 Py_ssize_t maxsplit)
890/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000891{
892 SRE_STATE state;
893 PyObject* list;
894 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100895 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000896 Py_ssize_t n;
897 Py_ssize_t i;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300898 const void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000899
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200900 assert(self->codesize != 0);
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200901
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300902 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000903 return NULL;
904
905 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000906 if (!list) {
907 state_fini(&state);
908 return NULL;
909 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000910
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000911 n = 0;
912 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000913
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000914 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000915
916 state_reset(&state);
917
918 state.ptr = state.start;
919
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300920 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300921 if (PyErr_Occurred())
922 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000923
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000924 if (status <= 0) {
925 if (status == 0)
926 break;
927 pattern_error(status);
928 goto error;
929 }
Tim Peters3d563502006-01-21 02:47:53 +0000930
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000931 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300932 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000933 string, STATE_OFFSET(&state, last),
934 STATE_OFFSET(&state, state.start)
935 );
936 if (!item)
937 goto error;
938 status = PyList_Append(list, item);
939 Py_DECREF(item);
940 if (status < 0)
941 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000942
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000943 /* add groups (if any) */
944 for (i = 0; i < self->groups; i++) {
945 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000946 if (!item)
947 goto error;
948 status = PyList_Append(list, item);
949 Py_DECREF(item);
950 if (status < 0)
951 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000952 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000953
954 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +0200955 state.must_advance = (state.ptr == state.start);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000956 last = state.start = state.ptr;
957
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000958 }
959
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000960 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300961 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000962 string, STATE_OFFSET(&state, last), state.endpos
963 );
964 if (!item)
965 goto error;
966 status = PyList_Append(list, item);
967 Py_DECREF(item);
968 if (status < 0)
969 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000970
971 state_fini(&state);
972 return list;
973
974error:
975 Py_DECREF(list);
976 state_fini(&state);
977 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000978
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000979}
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000980
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000981static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000982pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000983 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000984{
985 SRE_STATE state;
986 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +0300987 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000988 PyObject* item;
989 PyObject* filter;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000990 PyObject* match;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300991 const void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100992 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000993 Py_ssize_t n;
994 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300995 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000996 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600997 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000998
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000999 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001000 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001001 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001002 Py_INCREF(filter);
1003 filter_is_callable = 1;
1004 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001005 /* if not callable, check if it's a literal string */
1006 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001007 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001008 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001009 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001010 if (charsize == 1)
1011 literal = memchr(ptr, '\\', n) == NULL;
1012 else
1013 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001014 } else {
1015 PyErr_Clear();
1016 literal = 0;
1017 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06001018 if (view.buf)
1019 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001020 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001021 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001022 Py_INCREF(filter);
1023 filter_is_callable = 0;
1024 } else {
1025 /* not a literal; hand it over to the template compiler */
1026 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00001027 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001028 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001029 );
1030 if (!filter)
1031 return NULL;
1032 filter_is_callable = PyCallable_Check(filter);
1033 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00001034 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001035
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001036 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001037 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001038 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001039 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001040
1041 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001042 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001043 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001044 state_fini(&state);
1045 return NULL;
1046 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001047
1048 n = i = 0;
1049
1050 while (!count || n < count) {
1051
1052 state_reset(&state);
1053
1054 state.ptr = state.start;
1055
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001056 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001057 if (PyErr_Occurred())
1058 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001059
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001060 if (status <= 0) {
1061 if (status == 0)
1062 break;
1063 pattern_error(status);
1064 goto error;
1065 }
Tim Peters3d563502006-01-21 02:47:53 +00001066
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001067 b = STATE_OFFSET(&state, state.start);
1068 e = STATE_OFFSET(&state, state.ptr);
1069
1070 if (i < b) {
1071 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001072 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001073 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001074 if (!item)
1075 goto error;
1076 status = PyList_Append(list, item);
1077 Py_DECREF(item);
1078 if (status < 0)
1079 goto error;
1080
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001081 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001082
1083 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001084 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001085 match = pattern_new_match(self, &state, 1);
1086 if (!match)
1087 goto error;
Petr Viktorinffd97532020-02-11 17:46:57 +01001088 item = PyObject_CallOneArg(filter, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001089 Py_DECREF(match);
1090 if (!item)
1091 goto error;
1092 } else {
1093 /* filter is literal string */
1094 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001095 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001096 }
1097
1098 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001099 if (item != Py_None) {
1100 status = PyList_Append(list, item);
1101 Py_DECREF(item);
1102 if (status < 0)
1103 goto error;
1104 }
Tim Peters3d563502006-01-21 02:47:53 +00001105
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001106 i = e;
1107 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001108 state.must_advance = (state.ptr == state.start);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001109 state.start = state.ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001110 }
1111
1112 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001113 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001114 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001115 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001116 if (!item)
1117 goto error;
1118 status = PyList_Append(list, item);
1119 Py_DECREF(item);
1120 if (status < 0)
1121 goto error;
1122 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001123
1124 state_fini(&state);
1125
Guido van Rossum4e173842001-12-07 04:25:10 +00001126 Py_DECREF(filter);
1127
Fredrik Lundhdac58492001-10-21 21:48:30 +00001128 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001129 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001130 if (!joiner) {
1131 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001132 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001133 }
1134 if (PyList_GET_SIZE(list) == 0) {
1135 Py_DECREF(list);
1136 item = joiner;
1137 }
1138 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001139 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001140 item = _PyBytes_Join(joiner, list);
1141 else
1142 item = PyUnicode_Join(joiner, list);
1143 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001144 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001145 if (!item)
1146 return NULL;
1147 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001148
1149 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001150 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001151
1152 return item;
1153
1154error:
1155 Py_DECREF(list);
1156 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001157 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001158 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001159
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001160}
1161
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001162/*[clinic input]
1163_sre.SRE_Pattern.sub
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001164
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001165 repl: object
1166 string: object
1167 count: Py_ssize_t = 0
1168
1169Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1170[clinic start generated code]*/
1171
1172static PyObject *
1173_sre_SRE_Pattern_sub_impl(PatternObject *self, PyObject *repl,
1174 PyObject *string, Py_ssize_t count)
1175/*[clinic end generated code: output=1dbf2ec3479cba00 input=c53d70be0b3caf86]*/
1176{
1177 return pattern_subx(self, repl, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001178}
1179
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001180/*[clinic input]
1181_sre.SRE_Pattern.subn
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001182
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001183 repl: object
1184 string: object
1185 count: Py_ssize_t = 0
1186
1187Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1188[clinic start generated code]*/
1189
1190static PyObject *
1191_sre_SRE_Pattern_subn_impl(PatternObject *self, PyObject *repl,
1192 PyObject *string, Py_ssize_t count)
1193/*[clinic end generated code: output=0d9522cd529e9728 input=e7342d7ce6083577]*/
1194{
1195 return pattern_subx(self, repl, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001196}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001197
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001198/*[clinic input]
1199_sre.SRE_Pattern.__copy__
1200
1201[clinic start generated code]*/
1202
1203static PyObject *
1204_sre_SRE_Pattern___copy___impl(PatternObject *self)
1205/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001206{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001207 Py_INCREF(self);
1208 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001209}
1210
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001211/*[clinic input]
1212_sre.SRE_Pattern.__deepcopy__
1213
1214 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001215 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001216
1217[clinic start generated code]*/
1218
1219static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001220_sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1221/*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001222{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001223 Py_INCREF(self);
1224 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001225}
1226
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001227static PyObject *
1228pattern_repr(PatternObject *obj)
1229{
1230 static const struct {
1231 const char *name;
1232 int value;
1233 } flag_names[] = {
1234 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1235 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1236 {"re.LOCALE", SRE_FLAG_LOCALE},
1237 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1238 {"re.DOTALL", SRE_FLAG_DOTALL},
1239 {"re.UNICODE", SRE_FLAG_UNICODE},
1240 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1241 {"re.DEBUG", SRE_FLAG_DEBUG},
1242 {"re.ASCII", SRE_FLAG_ASCII},
1243 };
1244 PyObject *result = NULL;
1245 PyObject *flag_items;
Victor Stinner706768c2014-08-16 01:03:39 +02001246 size_t i;
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001247 int flags = obj->flags;
1248
1249 /* Omit re.UNICODE for valid string patterns. */
1250 if (obj->isbytes == 0 &&
1251 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1252 SRE_FLAG_UNICODE)
1253 flags &= ~SRE_FLAG_UNICODE;
1254
1255 flag_items = PyList_New(0);
1256 if (!flag_items)
1257 return NULL;
1258
1259 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1260 if (flags & flag_names[i].value) {
1261 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1262 if (!item)
1263 goto done;
1264
1265 if (PyList_Append(flag_items, item) < 0) {
1266 Py_DECREF(item);
1267 goto done;
1268 }
1269 Py_DECREF(item);
1270 flags &= ~flag_names[i].value;
1271 }
1272 }
1273 if (flags) {
1274 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1275 if (!item)
1276 goto done;
1277
1278 if (PyList_Append(flag_items, item) < 0) {
1279 Py_DECREF(item);
1280 goto done;
1281 }
1282 Py_DECREF(item);
1283 }
1284
1285 if (PyList_Size(flag_items) > 0) {
1286 PyObject *flags_result;
1287 PyObject *sep = PyUnicode_FromString("|");
1288 if (!sep)
1289 goto done;
1290 flags_result = PyUnicode_Join(sep, flag_items);
1291 Py_DECREF(sep);
1292 if (!flags_result)
1293 goto done;
1294 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1295 obj->pattern, flags_result);
1296 Py_DECREF(flags_result);
1297 }
1298 else {
1299 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1300 }
1301
1302done:
1303 Py_DECREF(flag_items);
1304 return result;
1305}
1306
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001307PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001308
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001309/* PatternObject's 'groupindex' method. */
1310static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02001311pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001312{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001313 if (self->groupindex == NULL)
1314 return PyDict_New();
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001315 return PyDictProxy_New(self->groupindex);
1316}
1317
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001318static int _validate(PatternObject *self); /* Forward */
1319
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001320/*[clinic input]
1321_sre.compile
1322
1323 pattern: object
1324 flags: int
1325 code: object(subclass_of='&PyList_Type')
1326 groups: Py_ssize_t
Victor Stinner726a57d2016-11-22 23:04:39 +01001327 groupindex: object(subclass_of='&PyDict_Type')
1328 indexgroup: object(subclass_of='&PyTuple_Type')
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001329
1330[clinic start generated code]*/
1331
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001332static PyObject *
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +03001333_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001334 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1335 PyObject *indexgroup)
Victor Stinner726a57d2016-11-22 23:04:39 +01001336/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001337{
1338 /* "compile" pattern descriptor to pattern object */
1339
1340 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001341 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001342
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001343 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001344 /* coverity[ampersand_in_size] */
Victor Stinner92055202020-04-08 00:38:15 +02001345 self = PyObject_NewVar(PatternObject, &Pattern_Type, n);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001346 if (!self)
1347 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001348 self->weakreflist = NULL;
1349 self->pattern = NULL;
1350 self->groupindex = NULL;
1351 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001352
1353 self->codesize = n;
1354
1355 for (i = 0; i < n; i++) {
1356 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001357 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001358 self->code[i] = (SRE_CODE) value;
1359 if ((unsigned long) self->code[i] != value) {
1360 PyErr_SetString(PyExc_OverflowError,
1361 "regular expression code size limit exceeded");
1362 break;
1363 }
1364 }
1365
1366 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001367 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001368 return NULL;
1369 }
1370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001372 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 else {
1375 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001376 int charsize;
1377 Py_buffer view;
1378 view.buf = NULL;
1379 if (!getstring(pattern, &p_length, &self->isbytes,
1380 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 Py_DECREF(self);
1382 return NULL;
1383 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001384 if (view.buf)
1385 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001387
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001388 Py_INCREF(pattern);
1389 self->pattern = pattern;
1390
1391 self->flags = flags;
1392
1393 self->groups = groups;
1394
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001395 if (PyDict_GET_SIZE(groupindex) > 0) {
1396 Py_INCREF(groupindex);
1397 self->groupindex = groupindex;
1398 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1399 Py_INCREF(indexgroup);
1400 self->indexgroup = indexgroup;
1401 }
1402 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001403
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001404 if (!_validate(self)) {
1405 Py_DECREF(self);
1406 return NULL;
1407 }
1408
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001409 return (PyObject*) self;
1410}
1411
Guido van Rossumb700df92000-03-31 14:59:30 +00001412/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001413/* Code validation */
1414
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1436
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesized
   printf() argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the *enclosing function*. */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros are not hygienic: they read and advance the local `code`,
   compare against the local `end`, and store into the locals `op`, `arg`
   or `skip` of the function that uses them.  Each one FAILs (returns 0
   from that function) when `code` has already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and check that, after subtracting `adj`, it does not
   reach past `end` when measured from the skip word itself.  `adj` is
   parenthesized in the expansion so that an expression argument cannot
   re-associate with the subtraction. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001477
1478static int
1479_validate_charset(SRE_CODE *code, SRE_CODE *end)
1480{
1481 /* Some variables are manipulated by the macros above */
1482 SRE_CODE op;
1483 SRE_CODE arg;
1484 SRE_CODE offset;
1485 int i;
1486
1487 while (code < end) {
1488 GET_OP;
1489 switch (op) {
1490
1491 case SRE_OP_NEGATE:
1492 break;
1493
1494 case SRE_OP_LITERAL:
1495 GET_ARG;
1496 break;
1497
1498 case SRE_OP_RANGE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001499 case SRE_OP_RANGE_UNI_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001500 GET_ARG;
1501 GET_ARG;
1502 break;
1503
1504 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001505 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Benjamin Petersonca470632016-09-06 13:47:26 -07001506 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001507 FAIL;
1508 code += offset;
1509 break;
1510
1511 case SRE_OP_BIGCHARSET:
1512 GET_ARG; /* Number of blocks */
1513 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001514 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001515 FAIL;
1516 /* Make sure that each byte points to a valid block */
1517 for (i = 0; i < 256; i++) {
1518 if (((unsigned char *)code)[i] >= arg)
1519 FAIL;
1520 }
1521 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001522 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Benjamin Petersonca470632016-09-06 13:47:26 -07001523 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001524 FAIL;
1525 code += offset;
1526 break;
1527
1528 case SRE_OP_CATEGORY:
1529 GET_ARG;
1530 switch (arg) {
1531 case SRE_CATEGORY_DIGIT:
1532 case SRE_CATEGORY_NOT_DIGIT:
1533 case SRE_CATEGORY_SPACE:
1534 case SRE_CATEGORY_NOT_SPACE:
1535 case SRE_CATEGORY_WORD:
1536 case SRE_CATEGORY_NOT_WORD:
1537 case SRE_CATEGORY_LINEBREAK:
1538 case SRE_CATEGORY_NOT_LINEBREAK:
1539 case SRE_CATEGORY_LOC_WORD:
1540 case SRE_CATEGORY_LOC_NOT_WORD:
1541 case SRE_CATEGORY_UNI_DIGIT:
1542 case SRE_CATEGORY_UNI_NOT_DIGIT:
1543 case SRE_CATEGORY_UNI_SPACE:
1544 case SRE_CATEGORY_UNI_NOT_SPACE:
1545 case SRE_CATEGORY_UNI_WORD:
1546 case SRE_CATEGORY_UNI_NOT_WORD:
1547 case SRE_CATEGORY_UNI_LINEBREAK:
1548 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1549 break;
1550 default:
1551 FAIL;
1552 }
1553 break;
1554
1555 default:
1556 FAIL;
1557
1558 }
1559 }
1560
1561 return 1;
1562}
1563
1564static int
1565_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1566{
1567 /* Some variables are manipulated by the macros above */
1568 SRE_CODE op;
1569 SRE_CODE arg;
1570 SRE_CODE skip;
1571
1572 VTRACE(("code=%p, end=%p\n", code, end));
1573
1574 if (code > end)
1575 FAIL;
1576
1577 while (code < end) {
1578 GET_OP;
1579 switch (op) {
1580
1581 case SRE_OP_MARK:
1582 /* We don't check whether marks are properly nested; the
1583 sre_match() code is robust even if they don't, and the worst
1584 you can get is nonsensical match results. */
1585 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001586 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001587 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1588 FAIL;
1589 }
1590 break;
1591
1592 case SRE_OP_LITERAL:
1593 case SRE_OP_NOT_LITERAL:
1594 case SRE_OP_LITERAL_IGNORE:
1595 case SRE_OP_NOT_LITERAL_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001596 case SRE_OP_LITERAL_UNI_IGNORE:
1597 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001598 case SRE_OP_LITERAL_LOC_IGNORE:
1599 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001600 GET_ARG;
1601 /* The arg is just a character, nothing to check */
1602 break;
1603
1604 case SRE_OP_SUCCESS:
1605 case SRE_OP_FAILURE:
1606 /* Nothing to check; these normally end the matching process */
1607 break;
1608
1609 case SRE_OP_AT:
1610 GET_ARG;
1611 switch (arg) {
1612 case SRE_AT_BEGINNING:
1613 case SRE_AT_BEGINNING_STRING:
1614 case SRE_AT_BEGINNING_LINE:
1615 case SRE_AT_END:
1616 case SRE_AT_END_LINE:
1617 case SRE_AT_END_STRING:
1618 case SRE_AT_BOUNDARY:
1619 case SRE_AT_NON_BOUNDARY:
1620 case SRE_AT_LOC_BOUNDARY:
1621 case SRE_AT_LOC_NON_BOUNDARY:
1622 case SRE_AT_UNI_BOUNDARY:
1623 case SRE_AT_UNI_NON_BOUNDARY:
1624 break;
1625 default:
1626 FAIL;
1627 }
1628 break;
1629
1630 case SRE_OP_ANY:
1631 case SRE_OP_ANY_ALL:
1632 /* These have no operands */
1633 break;
1634
1635 case SRE_OP_IN:
1636 case SRE_OP_IN_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001637 case SRE_OP_IN_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001638 case SRE_OP_IN_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001639 GET_SKIP;
1640 /* Stop 1 before the end; we check the FAILURE below */
1641 if (!_validate_charset(code, code+skip-2))
1642 FAIL;
1643 if (code[skip-2] != SRE_OP_FAILURE)
1644 FAIL;
1645 code += skip-1;
1646 break;
1647
1648 case SRE_OP_INFO:
1649 {
1650 /* A minimal info field is
1651 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1652 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1653 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001654 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001655 SRE_CODE *newcode;
1656 GET_SKIP;
1657 newcode = code+skip-1;
1658 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001659 GET_ARG;
1660 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001661 /* Check that only valid flags are present */
1662 if ((flags & ~(SRE_INFO_PREFIX |
1663 SRE_INFO_LITERAL |
1664 SRE_INFO_CHARSET)) != 0)
1665 FAIL;
1666 /* PREFIX and CHARSET are mutually exclusive */
1667 if ((flags & SRE_INFO_PREFIX) &&
1668 (flags & SRE_INFO_CHARSET))
1669 FAIL;
1670 /* LITERAL implies PREFIX */
1671 if ((flags & SRE_INFO_LITERAL) &&
1672 !(flags & SRE_INFO_PREFIX))
1673 FAIL;
1674 /* Validate the prefix */
1675 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001676 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001677 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001678 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001679 /* Here comes the prefix string */
Benjamin Petersonca470632016-09-06 13:47:26 -07001680 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001681 FAIL;
1682 code += prefix_len;
1683 /* And here comes the overlap table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001684 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001685 FAIL;
1686 /* Each overlap value should be < prefix_len */
1687 for (i = 0; i < prefix_len; i++) {
1688 if (code[i] >= prefix_len)
1689 FAIL;
1690 }
1691 code += prefix_len;
1692 }
1693 /* Validate the charset */
1694 if (flags & SRE_INFO_CHARSET) {
1695 if (!_validate_charset(code, newcode-1))
1696 FAIL;
1697 if (newcode[-1] != SRE_OP_FAILURE)
1698 FAIL;
1699 code = newcode;
1700 }
1701 else if (code != newcode) {
1702 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1703 FAIL;
1704 }
1705 }
1706 break;
1707
1708 case SRE_OP_BRANCH:
1709 {
1710 SRE_CODE *target = NULL;
1711 for (;;) {
1712 GET_SKIP;
1713 if (skip == 0)
1714 break;
1715 /* Stop 2 before the end; we check the JUMP below */
1716 if (!_validate_inner(code, code+skip-3, groups))
1717 FAIL;
1718 code += skip-3;
1719 /* Check that it ends with a JUMP, and that each JUMP
1720 has the same target */
1721 GET_OP;
1722 if (op != SRE_OP_JUMP)
1723 FAIL;
1724 GET_SKIP;
1725 if (target == NULL)
1726 target = code+skip-1;
1727 else if (code+skip-1 != target)
1728 FAIL;
1729 }
1730 }
1731 break;
1732
1733 case SRE_OP_REPEAT_ONE:
1734 case SRE_OP_MIN_REPEAT_ONE:
1735 {
1736 SRE_CODE min, max;
1737 GET_SKIP;
1738 GET_ARG; min = arg;
1739 GET_ARG; max = arg;
1740 if (min > max)
1741 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001742 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001743 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001744 if (!_validate_inner(code, code+skip-4, groups))
1745 FAIL;
1746 code += skip-4;
1747 GET_OP;
1748 if (op != SRE_OP_SUCCESS)
1749 FAIL;
1750 }
1751 break;
1752
1753 case SRE_OP_REPEAT:
1754 {
1755 SRE_CODE min, max;
1756 GET_SKIP;
1757 GET_ARG; min = arg;
1758 GET_ARG; max = arg;
1759 if (min > max)
1760 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001761 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001762 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001763 if (!_validate_inner(code, code+skip-3, groups))
1764 FAIL;
1765 code += skip-3;
1766 GET_OP;
1767 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1768 FAIL;
1769 }
1770 break;
1771
1772 case SRE_OP_GROUPREF:
1773 case SRE_OP_GROUPREF_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001774 case SRE_OP_GROUPREF_UNI_IGNORE:
1775 case SRE_OP_GROUPREF_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001776 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001777 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001778 FAIL;
1779 break;
1780
1781 case SRE_OP_GROUPREF_EXISTS:
1782 /* The regex syntax for this is: '(?(group)then|else)', where
1783 'group' is either an integer group number or a group name,
1784 'then' and 'else' are sub-regexes, and 'else' is optional. */
1785 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001786 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001787 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001788 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001789 code--; /* The skip is relative to the first arg! */
1790 /* There are two possibilities here: if there is both a 'then'
1791 part and an 'else' part, the generated code looks like:
1792
1793 GROUPREF_EXISTS
1794 <group>
1795 <skipyes>
1796 ...then part...
1797 JUMP
1798 <skipno>
1799 (<skipyes> jumps here)
1800 ...else part...
1801 (<skipno> jumps here)
1802
1803 If there is only a 'then' part, it looks like:
1804
1805 GROUPREF_EXISTS
1806 <group>
1807 <skip>
1808 ...then part...
1809 (<skip> jumps here)
1810
1811 There is no direct way to decide which it is, and we don't want
1812 to allow arbitrary jumps anywhere in the code; so we just look
1813 for a JUMP opcode preceding our skip target.
1814 */
Benjamin Petersonca470632016-09-06 13:47:26 -07001815 if (skip >= 3 && skip-3 < (uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001816 code[skip-3] == SRE_OP_JUMP)
1817 {
1818 VTRACE(("both then and else parts present\n"));
1819 if (!_validate_inner(code+1, code+skip-3, groups))
1820 FAIL;
1821 code += skip-2; /* Position after JUMP, at <skipno> */
1822 GET_SKIP;
1823 if (!_validate_inner(code, code+skip-1, groups))
1824 FAIL;
1825 code += skip-1;
1826 }
1827 else {
1828 VTRACE(("only a then part present\n"));
1829 if (!_validate_inner(code+1, code+skip-1, groups))
1830 FAIL;
1831 code += skip-1;
1832 }
1833 break;
1834
1835 case SRE_OP_ASSERT:
1836 case SRE_OP_ASSERT_NOT:
1837 GET_SKIP;
1838 GET_ARG; /* 0 for lookahead, width for lookbehind */
1839 code--; /* Back up over arg to simplify math below */
1840 if (arg & 0x80000000)
1841 FAIL; /* Width too large */
1842 /* Stop 1 before the end; we check the SUCCESS below */
1843 if (!_validate_inner(code+1, code+skip-2, groups))
1844 FAIL;
1845 code += skip-2;
1846 GET_OP;
1847 if (op != SRE_OP_SUCCESS)
1848 FAIL;
1849 break;
1850
1851 default:
1852 FAIL;
1853
1854 }
1855 }
1856
1857 VTRACE(("okay\n"));
1858 return 1;
1859}
1860
1861static int
1862_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1863{
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001864 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1865 code >= end || end[-1] != SRE_OP_SUCCESS)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001866 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001867 return _validate_inner(code, end-1, groups);
1868}
1869
1870static int
1871_validate(PatternObject *self)
1872{
1873 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1874 {
1875 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1876 return 0;
1877 }
1878 else
1879 VTRACE(("Success!\n"));
1880 return 1;
1881}
1882
1883/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001884/* match methods */
1885
1886static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001887match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001888{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 Py_XDECREF(self->regs);
1890 Py_XDECREF(self->string);
1891 Py_DECREF(self->pattern);
1892 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001893}
1894
1895static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001896match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001897{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001898 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001899 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001900 Py_buffer view;
1901 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001902 const void* ptr;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001903 Py_ssize_t i, j;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001904
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001905 assert(0 <= index && index < self->groups);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001906 index *= 2;
1907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 if (self->string == Py_None || self->mark[index] < 0) {
1909 /* return default value if the string or group is undefined */
1910 Py_INCREF(def);
1911 return def;
1912 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001913
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001914 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001915 if (ptr == NULL)
1916 return NULL;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001917
1918 i = self->mark[index];
1919 j = self->mark[index+1];
1920 i = Py_MIN(i, length);
1921 j = Py_MIN(j, length);
1922 result = getslice(isbytes, ptr, self->string, i, j);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001923 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001924 PyBuffer_Release(&view);
1925 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001926}
1927
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001928static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001929match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001930{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001931 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001932
Guido van Rossumddefaf32007-01-14 03:31:43 +00001933 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001934 /* Default value */
1935 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00001936
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001937 if (PyIndex_Check(index)) {
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001938 i = PyNumber_AsSsize_t(index, NULL);
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001939 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001940 else {
1941 i = -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001942
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001943 if (self->pattern->groupindex) {
1944 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
1945 if (index && PyLong_Check(index)) {
1946 i = PyLong_AsSsize_t(index);
1947 }
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001948 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001949 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001950 if (i < 0 || i >= self->groups) {
1951 /* raise IndexError if we were given a bad group number */
1952 if (!PyErr_Occurred()) {
1953 PyErr_SetString(PyExc_IndexError, "no such group");
1954 }
1955 return -1;
1956 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00001957
1958 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001959}
1960
1961static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00001962match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001963{
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001964 Py_ssize_t i = match_getindex(self, index);
1965
1966 if (i < 0) {
1967 return NULL;
1968 }
1969
1970 return match_getslice_by_index(self, i, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00001971}
1972
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001973/*[clinic input]
1974_sre.SRE_Match.expand
1975
1976 template: object
1977
1978Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
1979[clinic start generated code]*/
1980
1981static PyObject *
1982_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
1983/*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001984{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001985 /* delegate to Python code */
1986 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00001987 SRE_PY_MODULE, "_expand",
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001988 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001989 );
1990}
1991
1992static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001993match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00001994{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001995 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001996 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00001997
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001998 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00001999
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002000 switch (size) {
2001 case 0:
Serhiy Storchakaba85d692017-03-30 09:09:41 +03002002 result = match_getslice(self, _PyLong_Zero, Py_None);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002003 break;
2004 case 1:
2005 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2006 break;
2007 default:
2008 /* fetch multiple items */
2009 result = PyTuple_New(size);
2010 if (!result)
2011 return NULL;
2012 for (i = 0; i < size; i++) {
2013 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002014 self, PyTuple_GET_ITEM(args, i), Py_None
2015 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002016 if (!item) {
2017 Py_DECREF(result);
2018 return NULL;
2019 }
2020 PyTuple_SET_ITEM(result, i, item);
2021 }
2022 break;
2023 }
2024 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002025}
2026
Eric V. Smith605bdae2016-09-11 08:55:43 -04002027static PyObject*
2028match_getitem(MatchObject* self, PyObject* name)
2029{
2030 return match_getslice(self, name, Py_None);
2031}
2032
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002033/*[clinic input]
2034_sre.SRE_Match.groups
2035
2036 default: object = None
2037 Is used for groups that did not participate in the match.
2038
2039Return a tuple containing all the subgroups of the match, from 1.
2040[clinic start generated code]*/
2041
2042static PyObject *
2043_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2044/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002045{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002046 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002047 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002048
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002049 result = PyTuple_New(self->groups-1);
2050 if (!result)
2051 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002052
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002053 for (index = 1; index < self->groups; index++) {
2054 PyObject* item;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002055 item = match_getslice_by_index(self, index, default_value);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002056 if (!item) {
2057 Py_DECREF(result);
2058 return NULL;
2059 }
2060 PyTuple_SET_ITEM(result, index-1, item);
2061 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002063 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002064}
2065
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002066/*[clinic input]
2067_sre.SRE_Match.groupdict
2068
2069 default: object = None
2070 Is used for groups that did not participate in the match.
2071
2072Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2073[clinic start generated code]*/
2074
2075static PyObject *
2076_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2077/*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002078{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002079 PyObject *result;
2080 PyObject *key;
2081 PyObject *value;
2082 Py_ssize_t pos = 0;
2083 Py_hash_t hash;
Guido van Rossumb700df92000-03-31 14:59:30 +00002084
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002085 result = PyDict_New();
2086 if (!result || !self->pattern->groupindex)
2087 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002088
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002089 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002090 int status;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002091 Py_INCREF(key);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002092 value = match_getslice(self, key, default_value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002093 if (!value) {
2094 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002095 goto failed;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002096 }
2097 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002098 Py_DECREF(value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002099 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002100 if (status < 0)
2101 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002102 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002103
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002104 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002105
2106failed:
Fredrik Lundh770617b2001-01-14 15:06:11 +00002107 Py_DECREF(result);
2108 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002109}
2110
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002111/*[clinic input]
2112_sre.SRE_Match.start -> Py_ssize_t
2113
2114 group: object(c_default="NULL") = 0
2115 /
2116
2117Return index of the start of the substring matched by group.
2118[clinic start generated code]*/
2119
2120static Py_ssize_t
2121_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2122/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002123{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002124 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002125
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002126 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002127 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002128 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002129
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002130 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002131 return self->mark[index*2];
Guido van Rossumb700df92000-03-31 14:59:30 +00002132}
2133
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002134/*[clinic input]
2135_sre.SRE_Match.end -> Py_ssize_t
2136
2137 group: object(c_default="NULL") = 0
2138 /
2139
2140Return index of the end of the substring matched by group.
2141[clinic start generated code]*/
2142
2143static Py_ssize_t
2144_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2145/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002146{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002147 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002148
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002149 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002150 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002151 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002152
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002153 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002154 return self->mark[index*2+1];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002155}
2156
2157LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002158_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002159{
2160 PyObject* pair;
2161 PyObject* item;
2162
2163 pair = PyTuple_New(2);
2164 if (!pair)
2165 return NULL;
2166
Christian Heimes217cfd12007-12-02 14:31:20 +00002167 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002168 if (!item)
2169 goto error;
2170 PyTuple_SET_ITEM(pair, 0, item);
2171
Christian Heimes217cfd12007-12-02 14:31:20 +00002172 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002173 if (!item)
2174 goto error;
2175 PyTuple_SET_ITEM(pair, 1, item);
2176
2177 return pair;
2178
2179 error:
2180 Py_DECREF(pair);
2181 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002182}
2183
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002184/*[clinic input]
2185_sre.SRE_Match.span
2186
2187 group: object(c_default="NULL") = 0
2188 /
2189
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002190For match object m, return the 2-tuple (m.start(group), m.end(group)).
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002191[clinic start generated code]*/
2192
2193static PyObject *
2194_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002195/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002196{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002197 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002198
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002199 if (index < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002200 return NULL;
2201 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002202
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002203 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002204 return _pair(self->mark[index*2], self->mark[index*2+1]);
2205}
2206
2207static PyObject*
2208match_regs(MatchObject* self)
2209{
2210 PyObject* regs;
2211 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002212 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002213
2214 regs = PyTuple_New(self->groups);
2215 if (!regs)
2216 return NULL;
2217
2218 for (index = 0; index < self->groups; index++) {
2219 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2220 if (!item) {
2221 Py_DECREF(regs);
2222 return NULL;
2223 }
2224 PyTuple_SET_ITEM(regs, index, item);
2225 }
2226
2227 Py_INCREF(regs);
2228 self->regs = regs;
2229
2230 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002231}
2232
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002233/*[clinic input]
2234_sre.SRE_Match.__copy__
2235
2236[clinic start generated code]*/
2237
2238static PyObject *
2239_sre_SRE_Match___copy___impl(MatchObject *self)
2240/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002241{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002242 Py_INCREF(self);
2243 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002244}
2245
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002246/*[clinic input]
2247_sre.SRE_Match.__deepcopy__
2248
2249 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002250 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002251
2252[clinic start generated code]*/
2253
2254static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002255_sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2256/*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002257{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002258 Py_INCREF(self);
2259 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002260}
2261
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002262PyDoc_STRVAR(match_doc,
2263"The result of re.match() and re.search().\n\
2264Match objects always have a boolean value of True.");
2265
/* Docstring attached to the "group" entry of match_methods. */
2266PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002267"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002268 Return subgroup(s) of the match by indices or names.\n\
2269 For 0 returns the entire match.");
2270
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002271static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002272match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
Guido van Rossumb700df92000-03-31 14:59:30 +00002273{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002274 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002275 return PyLong_FromSsize_t(self->lastindex);
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002276 Py_RETURN_NONE;
Guido van Rossumb700df92000-03-31 14:59:30 +00002277}
2278
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002279static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002280match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002281{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002282 if (self->pattern->indexgroup &&
2283 self->lastindex >= 0 &&
2284 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2285 {
2286 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2287 self->lastindex);
2288 Py_INCREF(result);
2289 return result;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002290 }
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002291 Py_RETURN_NONE;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002292}
2293
2294static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002295match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002296{
2297 if (self->regs) {
2298 Py_INCREF(self->regs);
2299 return self->regs;
2300 } else
2301 return match_regs(self);
2302}
2303
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002304static PyObject *
2305match_repr(MatchObject *self)
2306{
2307 PyObject *result;
2308 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2309 if (group0 == NULL)
2310 return NULL;
2311 result = PyUnicode_FromFormat(
sth8b91eda2019-03-10 11:29:14 +01002312 "<%s object; span=(%zd, %zd), match=%.50R>",
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002313 Py_TYPE(self)->tp_name,
2314 self->mark[0], self->mark[1], group0);
2315 Py_DECREF(group0);
2316 return result;
2317}
2318
2319
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002320static PyObject*
Victor Stinnerf5587782013-11-15 23:21:11 +01002321pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002322{
2323 /* create match object (from state object) */
2324
2325 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002326 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002327 char* base;
2328 int n;
2329
2330 if (status > 0) {
2331
2332 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002333 /* coverity[ampersand_in_size] */
Victor Stinner92055202020-04-08 00:38:15 +02002334 match = PyObject_NewVar(MatchObject, &Match_Type,
2335 2*(pattern->groups+1));
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002336 if (!match)
2337 return NULL;
2338
2339 Py_INCREF(pattern);
2340 match->pattern = pattern;
2341
2342 Py_INCREF(state->string);
2343 match->string = state->string;
2344
2345 match->regs = NULL;
2346 match->groups = pattern->groups+1;
2347
2348 /* fill in group slices */
2349
2350 base = (char*) state->beginning;
2351 n = state->charsize;
2352
2353 match->mark[0] = ((char*) state->start - base) / n;
2354 match->mark[1] = ((char*) state->ptr - base) / n;
2355
2356 for (i = j = 0; i < pattern->groups; i++, j+=2)
2357 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2358 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2359 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2360 } else
2361 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2362
2363 match->pos = state->pos;
2364 match->endpos = state->endpos;
2365
2366 match->lastindex = state->lastindex;
2367
2368 return (PyObject*) match;
2369
2370 } else if (status == 0) {
2371
2372 /* no match */
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002373 Py_RETURN_NONE;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002374
2375 }
2376
2377 /* internal error */
2378 pattern_error(status);
2379 return NULL;
2380}
2381
2382
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002383/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002384/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002385
2386static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002387scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002388{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002389 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002390 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002391 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002392}
2393
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002394/*[clinic input]
2395_sre.SRE_Scanner.match
2396
2397[clinic start generated code]*/
2398
2399static PyObject *
2400_sre_SRE_Scanner_match_impl(ScannerObject *self)
2401/*[clinic end generated code: output=936b30c63d4b81eb input=881a0154f8c13d9a]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002402{
2403 SRE_STATE* state = &self->state;
2404 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002405 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002406
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002407 if (state->start == NULL)
2408 Py_RETURN_NONE;
2409
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002410 state_reset(state);
2411
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002412 state->ptr = state->start;
2413
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002414 status = sre_match(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002415 if (PyErr_Occurred())
2416 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002417
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002418 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002419 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002420
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002421 if (status == 0)
2422 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002423 else {
2424 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002425 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002426 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002427
2428 return match;
2429}
2430
2431
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002432/*[clinic input]
2433_sre.SRE_Scanner.search
2434
2435[clinic start generated code]*/
2436
2437static PyObject *
2438_sre_SRE_Scanner_search_impl(ScannerObject *self)
2439/*[clinic end generated code: output=7dc211986088f025 input=161223ee92ef9270]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002440{
2441 SRE_STATE* state = &self->state;
2442 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002443 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002444
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002445 if (state->start == NULL)
2446 Py_RETURN_NONE;
2447
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002448 state_reset(state);
2449
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002450 state->ptr = state->start;
2451
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002452 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002453 if (PyErr_Occurred())
2454 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002455
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002456 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002457 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002458
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002459 if (status == 0)
2460 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002461 else {
2462 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002463 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002464 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002465
2466 return match;
2467}
2468
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002469static PyObject *
2470pattern_scanner(PatternObject *self, PyObject *string, Py_ssize_t pos, Py_ssize_t endpos)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002471{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002472 ScannerObject* scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473
2474 /* create scanner object */
Victor Stinner92055202020-04-08 00:38:15 +02002475 scanner = PyObject_New(ScannerObject, &Scanner_Type);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002476 if (!scanner)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002477 return NULL;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002478 scanner->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002479
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002480 /* create search state object */
2481 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2482 Py_DECREF(scanner);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002483 return NULL;
2484 }
2485
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002486 Py_INCREF(self);
2487 scanner->pattern = (PyObject*) self;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002488
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002489 return (PyObject*) scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002490}
2491
Victor Stinnerb44fb122016-11-21 16:35:08 +01002492static Py_hash_t
2493pattern_hash(PatternObject *self)
2494{
2495 Py_hash_t hash, hash2;
2496
2497 hash = PyObject_Hash(self->pattern);
2498 if (hash == -1) {
2499 return -1;
2500 }
2501
2502 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2503 hash ^= hash2;
2504
2505 hash ^= self->flags;
2506 hash ^= self->isbytes;
2507 hash ^= self->codesize;
2508
2509 if (hash == -1) {
2510 hash = -2;
2511 }
2512 return hash;
2513}
2514
2515static PyObject*
2516pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2517{
2518 PatternObject *left, *right;
2519 int cmp;
2520
2521 if (op != Py_EQ && op != Py_NE) {
2522 Py_RETURN_NOTIMPLEMENTED;
2523 }
2524
Dong-hee Na1b55b652020-02-17 19:09:15 +09002525 if (!Py_IS_TYPE(lefto, &Pattern_Type) || !Py_IS_TYPE(righto, &Pattern_Type)) {
Victor Stinnerb44fb122016-11-21 16:35:08 +01002526 Py_RETURN_NOTIMPLEMENTED;
2527 }
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01002528
2529 if (lefto == righto) {
2530 /* a pattern is equal to itself */
2531 return PyBool_FromLong(op == Py_EQ);
2532 }
2533
Victor Stinnerb44fb122016-11-21 16:35:08 +01002534 left = (PatternObject *)lefto;
2535 right = (PatternObject *)righto;
2536
2537 cmp = (left->flags == right->flags
2538 && left->isbytes == right->isbytes
Victor Stinnere670b2d2016-11-22 15:23:00 +01002539 && left->codesize == right->codesize);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002540 if (cmp) {
2541 /* Compare the code and the pattern because the same pattern can
2542 produce different codes depending on the locale used to compile the
2543 pattern when the re.LOCALE flag is used. Don't compare groups,
2544 indexgroup nor groupindex: they are derivated from the pattern. */
2545 cmp = (memcmp(left->code, right->code,
2546 sizeof(left->code[0]) * left->codesize) == 0);
2547 }
2548 if (cmp) {
2549 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2550 Py_EQ);
2551 if (cmp < 0) {
2552 return NULL;
2553 }
2554 }
2555 if (op == Py_NE) {
2556 cmp = !cmp;
2557 }
2558 return PyBool_FromLong(cmp);
2559}
2560
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002561#include "clinic/_sre.c.h"
2562
2563static PyMethodDef pattern_methods[] = {
2564 _SRE_SRE_PATTERN_MATCH_METHODDEF
2565 _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
2566 _SRE_SRE_PATTERN_SEARCH_METHODDEF
2567 _SRE_SRE_PATTERN_SUB_METHODDEF
2568 _SRE_SRE_PATTERN_SUBN_METHODDEF
2569 _SRE_SRE_PATTERN_FINDALL_METHODDEF
2570 _SRE_SRE_PATTERN_SPLIT_METHODDEF
2571 _SRE_SRE_PATTERN_FINDITER_METHODDEF
2572 _SRE_SRE_PATTERN_SCANNER_METHODDEF
2573 _SRE_SRE_PATTERN___COPY___METHODDEF
2574 _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002575 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2576 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002577 {NULL, NULL}
2578};
2579
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002580static PyGetSetDef pattern_getset[] = {
2581 {"groupindex", (getter)pattern_groupindex, (setter)NULL,
2582 "A dictionary mapping group names to group numbers."},
2583 {NULL} /* Sentinel */
2584};
2585
2586#define PAT_OFF(x) offsetof(PatternObject, x)
2587static PyMemberDef pattern_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002588 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY,
2589 "The pattern string from which the RE object was compiled."},
2590 {"flags", T_INT, PAT_OFF(flags), READONLY,
2591 "The regex matching flags."},
2592 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY,
2593 "The number of capturing groups in the pattern."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002594 {NULL} /* Sentinel */
2595};
2596
2597static PyTypeObject Pattern_Type = {
2598 PyVarObject_HEAD_INIT(NULL, 0)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002599 "re.Pattern",
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002600 sizeof(PatternObject), sizeof(SRE_CODE),
2601 (destructor)pattern_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002602 0, /* tp_vectorcall_offset */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002603 0, /* tp_getattr */
2604 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002605 0, /* tp_as_async */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002606 (reprfunc)pattern_repr, /* tp_repr */
2607 0, /* tp_as_number */
2608 0, /* tp_as_sequence */
2609 0, /* tp_as_mapping */
Victor Stinnerb44fb122016-11-21 16:35:08 +01002610 (hashfunc)pattern_hash, /* tp_hash */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002611 0, /* tp_call */
2612 0, /* tp_str */
2613 0, /* tp_getattro */
2614 0, /* tp_setattro */
2615 0, /* tp_as_buffer */
2616 Py_TPFLAGS_DEFAULT, /* tp_flags */
2617 pattern_doc, /* tp_doc */
2618 0, /* tp_traverse */
2619 0, /* tp_clear */
Victor Stinnerb44fb122016-11-21 16:35:08 +01002620 pattern_richcompare, /* tp_richcompare */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002621 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2622 0, /* tp_iter */
2623 0, /* tp_iternext */
2624 pattern_methods, /* tp_methods */
2625 pattern_members, /* tp_members */
2626 pattern_getset, /* tp_getset */
2627};
2628
Eric V. Smith605bdae2016-09-11 08:55:43 -04002629/* Match objects do not support length or assignment, but do support
2630 __getitem__. */
2631static PyMappingMethods match_as_mapping = {
2632 NULL,
2633 (binaryfunc)match_getitem,
2634 NULL
2635};
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002636
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002637static PyMethodDef match_methods[] = {
2638 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2639 _SRE_SRE_MATCH_START_METHODDEF
2640 _SRE_SRE_MATCH_END_METHODDEF
2641 _SRE_SRE_MATCH_SPAN_METHODDEF
2642 _SRE_SRE_MATCH_GROUPS_METHODDEF
2643 _SRE_SRE_MATCH_GROUPDICT_METHODDEF
2644 _SRE_SRE_MATCH_EXPAND_METHODDEF
2645 _SRE_SRE_MATCH___COPY___METHODDEF
2646 _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002647 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2648 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002649 {NULL, NULL}
2650};
2651
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002652static PyGetSetDef match_getset[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002653 {"lastindex", (getter)match_lastindex_get, (setter)NULL,
2654 "The integer index of the last matched capturing group."},
2655 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
2656 "The name of the last matched capturing group."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002657 {"regs", (getter)match_regs_get, (setter)NULL},
2658 {NULL}
2659};
2660
2661#define MATCH_OFF(x) offsetof(MatchObject, x)
2662static PyMemberDef match_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002663 {"string", T_OBJECT, MATCH_OFF(string), READONLY,
2664 "The string passed to match() or search()."},
2665 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY,
2666 "The regular expression object."},
2667 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY,
2668 "The index into the string at which the RE engine started looking for a match."},
2669 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY,
2670 "The index into the string beyond which the RE engine will not go."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002671 {NULL}
2672};
2673
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
2676
2677static PyTypeObject Match_Type = {
2678 PyVarObject_HEAD_INIT(NULL,0)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002679 "re.Match",
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002680 sizeof(MatchObject), sizeof(Py_ssize_t),
2681 (destructor)match_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002682 0, /* tp_vectorcall_offset */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002683 0, /* tp_getattr */
2684 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002685 0, /* tp_as_async */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002686 (reprfunc)match_repr, /* tp_repr */
2687 0, /* tp_as_number */
2688 0, /* tp_as_sequence */
Eric V. Smith605bdae2016-09-11 08:55:43 -04002689 &match_as_mapping, /* tp_as_mapping */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002690 0, /* tp_hash */
2691 0, /* tp_call */
2692 0, /* tp_str */
2693 0, /* tp_getattro */
2694 0, /* tp_setattro */
2695 0, /* tp_as_buffer */
2696 Py_TPFLAGS_DEFAULT, /* tp_flags */
2697 match_doc, /* tp_doc */
2698 0, /* tp_traverse */
2699 0, /* tp_clear */
2700 0, /* tp_richcompare */
2701 0, /* tp_weaklistoffset */
2702 0, /* tp_iter */
2703 0, /* tp_iternext */
2704 match_methods, /* tp_methods */
2705 match_members, /* tp_members */
2706 match_getset, /* tp_getset */
2707};
2708
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002709static PyMethodDef scanner_methods[] = {
2710 _SRE_SRE_SCANNER_MATCH_METHODDEF
2711 _SRE_SRE_SCANNER_SEARCH_METHODDEF
2712 {NULL, NULL}
2713};
2714
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002715#define SCAN_OFF(x) offsetof(ScannerObject, x)
2716static PyMemberDef scanner_members[] = {
2717 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
2718 {NULL} /* Sentinel */
2719};
2720
2721static PyTypeObject Scanner_Type = {
2722 PyVarObject_HEAD_INIT(NULL, 0)
2723 "_" SRE_MODULE ".SRE_Scanner",
2724 sizeof(ScannerObject), 0,
2725 (destructor)scanner_dealloc,/* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002726 0, /* tp_vectorcall_offset */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002727 0, /* tp_getattr */
2728 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +02002729 0, /* tp_as_async */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002730 0, /* tp_repr */
2731 0, /* tp_as_number */
2732 0, /* tp_as_sequence */
2733 0, /* tp_as_mapping */
2734 0, /* tp_hash */
2735 0, /* tp_call */
2736 0, /* tp_str */
2737 0, /* tp_getattro */
2738 0, /* tp_setattro */
2739 0, /* tp_as_buffer */
2740 Py_TPFLAGS_DEFAULT, /* tp_flags */
2741 0, /* tp_doc */
2742 0, /* tp_traverse */
2743 0, /* tp_clear */
2744 0, /* tp_richcompare */
2745 0, /* tp_weaklistoffset */
2746 0, /* tp_iter */
2747 0, /* tp_iternext */
2748 scanner_methods, /* tp_methods */
2749 scanner_members, /* tp_members */
2750 0, /* tp_getset */
2751};
2752
Guido van Rossumb700df92000-03-31 14:59:30 +00002753static PyMethodDef _functions[] = {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002754 _SRE_COMPILE_METHODDEF
2755 _SRE_GETCODESIZE_METHODDEF
Serhiy Storchaka6d336a02017-05-09 23:37:14 +03002756 _SRE_ASCII_ISCASED_METHODDEF
2757 _SRE_UNICODE_ISCASED_METHODDEF
Serhiy Storchaka7186cc22017-05-05 10:42:46 +03002758 _SRE_ASCII_TOLOWER_METHODDEF
2759 _SRE_UNICODE_TOLOWER_METHODDEF
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002760 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002761};
2762
Martin v. Löwis1a214512008-06-11 05:26:20 +00002763static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002764 PyModuleDef_HEAD_INIT,
2765 "_" SRE_MODULE,
2766 NULL,
2767 -1,
2768 _functions,
2769 NULL,
2770 NULL,
2771 NULL,
2772 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00002773};
2774
2775PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002776{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002777 PyObject* m;
2778 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00002779 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002780
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00002781 /* Patch object types */
2782 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
2783 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00002784 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002785
Martin v. Löwis1a214512008-06-11 05:26:20 +00002786 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00002787 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002788 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00002789 d = PyModule_GetDict(m);
2790
Christian Heimes217cfd12007-12-02 14:31:20 +00002791 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002792 if (x) {
2793 PyDict_SetItemString(d, "MAGIC", x);
2794 Py_DECREF(x);
2795 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00002796
Christian Heimes217cfd12007-12-02 14:31:20 +00002797 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00002798 if (x) {
2799 PyDict_SetItemString(d, "CODESIZE", x);
2800 Py_DECREF(x);
2801 }
2802
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02002803 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
2804 if (x) {
2805 PyDict_SetItemString(d, "MAXREPEAT", x);
2806 Py_DECREF(x);
2807 }
2808
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03002809 x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
2810 if (x) {
2811 PyDict_SetItemString(d, "MAXGROUPS", x);
2812 Py_DECREF(x);
2813 }
2814
Neal Norwitzfe537132007-08-26 03:55:15 +00002815 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00002816 if (x) {
2817 PyDict_SetItemString(d, "copyright", x);
2818 Py_DECREF(x);
2819 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00002820 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00002821}
2822
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002823/* vim:ts=4:sw=4:et
2824*/