blob: 7b21240e9a0339ac9758c5ddd7c478137ae1f392 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Serhiy Storchaka32eddc12013-11-23 23:20:30 +02007 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 * 2013-02-04 mrab added fullmatch primitive
Guido van Rossumb700df92000-03-31 14:59:30 +000026 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000027 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000028 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000029 * This version of the SRE library can be redistributed under CNRI's
30 * Python 1.6 license. For any other use, please contact Secret Labs
31 * AB (info@pythonware.com).
32 *
Guido van Rossumb700df92000-03-31 14:59:30 +000033 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000034 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000035 * other compatibility work.
36 */
37
Serhiy Storchaka2d06e842015-12-25 19:53:18 +020038static const char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000039 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Victor Stinner37834132020-10-27 17:12:53 +010044#include "pycore_long.h" // _PyLong_GetZero()
Victor Stinnercdad2722021-04-22 00:52:52 +020045#include "pycore_moduleobject.h" // _PyModule_GetState()
Victor Stinner4a21e572020-04-15 02:35:41 +020046#include "structmember.h" // PyMemberDef
Guido van Rossumb700df92000-03-31 14:59:30 +000047
48#include "sre.h"
49
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030050#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
51
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000052#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000053
Fredrik Lundh436c3d582000-06-29 08:58:44 +000054/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000055#if !defined(SRE_MODULE)
56#define SRE_MODULE "sre"
57#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000058
Thomas Wouters9ada3d62006-04-21 09:47:09 +000059#define SRE_PY_MODULE "re"
60
Guido van Rossumb700df92000-03-31 14:59:30 +000061/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000062#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000065
/* LOCAL(type) introduces a file-local inline helper returning `type`.
   The exact storage class / calling convention depends on the compiler. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
/* every other compiler: plain static inline */
#define LOCAL(type) static inline type
#endif
74
/* error codes (all negative); pattern_error() turns a status code into a
   Python exception */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000081
/* TRACE(v) takes a fully parenthesised printf argument list, e.g.
   TRACE(("value %zd\n", n)).  It expands to a printf call when VERBOSE is
   defined and to nothing otherwise. */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
87
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000088/* -------------------------------------------------------------------- */
89/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000090
/* ASCII-only character predicates.  The upper-bound comparison is evaluated
   first, so the Py_IS* macro is only reached for values at or below that
   bound.  Each macro expands its argument more than once. */
#define SRE_IS_DIGIT(c) ((c) <= '9' && Py_ISDIGIT(c))
#define SRE_IS_SPACE(c) ((c) <= ' ' && Py_ISSPACE(c))
#define SRE_IS_LINEBREAK(c) ((c) == '\n')
#define SRE_IS_WORD(c) ((c) <= 'z' && (Py_ISALNUM(c) || (c) == '_'))
Guido van Rossumb700df92000-03-31 14:59:30 +000099
/* Lowercase `ch` when it lies in the ASCII range (< 128); any other value is
   returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
104
/* locale-specific character predicates */
/* For any unsigned c, !(c & ~N) is the same test as (c < N+1); writing it
 * with a mask avoids warnings when c's type only supports numbers < N+1.
 * Values above 255 are never passed to isalnum(). */
#define SRE_LOC_IS_ALNUM(c) (((c) & ~255) ? 0 : isalnum((c)))
#define SRE_LOC_IS_WORD(c) (SRE_LOC_IS_ALNUM((c)) || (c) == '_')
110
/* Lowercase `ch` with the C library's tolower() when it is below 256; larger
   values are passed through untouched. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)tolower(ch);
    }
    return ch;
}
115
/* Uppercase `ch` with the C library's toupper() when it is below 256; larger
   values are passed through untouched. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)toupper(ch);
    }
    return ch;
}
120
/* unicode-specific character predicates: thin aliases for the Py_UNICODE_*
   classification macros, plus a "word" test that also accepts '_' */

#define SRE_UNI_IS_DIGIT(c) Py_UNICODE_ISDECIMAL(c)
#define SRE_UNI_IS_SPACE(c) Py_UNICODE_ISSPACE(c)
#define SRE_UNI_IS_LINEBREAK(c) Py_UNICODE_ISLINEBREAK(c)
#define SRE_UNI_IS_ALNUM(c) Py_UNICODE_ISALNUM(c)
#define SRE_UNI_IS_WORD(c) (SRE_UNI_IS_ALNUM(c) || (c) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000128
/* Lowercase `ch` via Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    const unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
133
/* Uppercase `ch` via Py_UNICODE_TOUPPER. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    const unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
    return raised;
}
138
Guido van Rossumb700df92000-03-31 14:59:30 +0000139LOCAL(int)
140sre_category(SRE_CODE category, unsigned int ch)
141{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000142 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000143
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000144 case SRE_CATEGORY_DIGIT:
145 return SRE_IS_DIGIT(ch);
146 case SRE_CATEGORY_NOT_DIGIT:
147 return !SRE_IS_DIGIT(ch);
148 case SRE_CATEGORY_SPACE:
149 return SRE_IS_SPACE(ch);
150 case SRE_CATEGORY_NOT_SPACE:
151 return !SRE_IS_SPACE(ch);
152 case SRE_CATEGORY_WORD:
153 return SRE_IS_WORD(ch);
154 case SRE_CATEGORY_NOT_WORD:
155 return !SRE_IS_WORD(ch);
156 case SRE_CATEGORY_LINEBREAK:
157 return SRE_IS_LINEBREAK(ch);
158 case SRE_CATEGORY_NOT_LINEBREAK:
159 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000160
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000161 case SRE_CATEGORY_LOC_WORD:
162 return SRE_LOC_IS_WORD(ch);
163 case SRE_CATEGORY_LOC_NOT_WORD:
164 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000165
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000166 case SRE_CATEGORY_UNI_DIGIT:
167 return SRE_UNI_IS_DIGIT(ch);
168 case SRE_CATEGORY_UNI_NOT_DIGIT:
169 return !SRE_UNI_IS_DIGIT(ch);
170 case SRE_CATEGORY_UNI_SPACE:
171 return SRE_UNI_IS_SPACE(ch);
172 case SRE_CATEGORY_UNI_NOT_SPACE:
173 return !SRE_UNI_IS_SPACE(ch);
174 case SRE_CATEGORY_UNI_WORD:
175 return SRE_UNI_IS_WORD(ch);
176 case SRE_CATEGORY_UNI_NOT_WORD:
177 return !SRE_UNI_IS_WORD(ch);
178 case SRE_CATEGORY_UNI_LINEBREAK:
179 return SRE_UNI_IS_LINEBREAK(ch);
180 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
181 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000182 }
183 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000184}
185
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300186LOCAL(int)
187char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
188{
189 return ch == pattern
190 || (SRE_CODE) sre_lower_locale(ch) == pattern
191 || (SRE_CODE) sre_upper_locale(ch) == pattern;
192}
193
194
Guido van Rossumb700df92000-03-31 14:59:30 +0000195/* helpers */
196
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000197static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000198data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000199{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000200 if (state->data_stack) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100201 PyMem_Free(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000202 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000203 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000204 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000205}
206
207static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000208data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000209{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000210 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000211 minsize = state->data_stack_base+size;
212 cursize = state->data_stack_size;
213 if (cursize < minsize) {
214 void* stack;
215 cursize = minsize+minsize/4+1024;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200216 TRACE(("allocate/grow stack %zd\n", cursize));
Victor Stinner00d7abd2020-12-01 09:56:42 +0100217 stack = PyMem_Realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000218 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000219 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000220 return SRE_ERROR_MEMORY;
221 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000222 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000223 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000224 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000225 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000226}
227
/* The matching engine in sre_lib.h is included three times, once per
   character width.  Before each inclusion:
     SRE_CHAR        - the character type,
     SIZEOF_SRE_CHAR - its size in bytes,
     SRE(F)          - pastes a width-specific prefix onto the name F
                       (giving sre_ucs1_F, sre_ucs2_F, sre_ucs4_F).
   NOTE(review): the three macros are redefined without an #undef in
   between, so sre_lib.h presumably undefines them at its end - confirm. */

/* generate 8-bit version */

#define SRE_CHAR Py_UCS1
#define SIZEOF_SRE_CHAR 1
#define SRE(F) sre_ucs1_##F
#include "sre_lib.h"

/* generate 16-bit unicode version */

#define SRE_CHAR Py_UCS2
#define SIZEOF_SRE_CHAR 2
#define SRE(F) sre_ucs2_##F
#include "sre_lib.h"

/* generate 32-bit unicode version */

#define SRE_CHAR Py_UCS4
#define SIZEOF_SRE_CHAR 4
#define SRE(F) sre_ucs4_##F
#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000248
249/* -------------------------------------------------------------------- */
250/* factories and destructors */
251
/* module state: the type objects used by this module, looked up through
   get_sre_module_state() */
typedef struct {
    PyTypeObject *Pattern_Type;   /* type of _sre.SRE_Pattern objects */
    PyTypeObject *Match_Type;     /* type of _sre.SRE_Match objects */
    PyTypeObject *Scanner_Type;   /* type of _sre.SRE_Scanner objects */
} _sremodulestate;
Guido van Rossumb700df92000-03-31 14:59:30 +0000258
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100259static _sremodulestate *
260get_sre_module_state(PyObject *m)
261{
Victor Stinnercdad2722021-04-22 00:52:52 +0200262 _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100263 assert(state);
264 return state;
265}
266
/* forward declaration of the module definition */
static struct PyModuleDef sremodule;
/* Fetch the module state starting from a type object: the type's module is
   obtained with PyType_GetModule() and handed to get_sre_module_state(). */
#define get_sre_module_state_by_class(cls) \
    (get_sre_module_state(PyType_GetModule(cls)))

/* see sre.h for object declarations */
/* forward declarations of helpers that are used before they are defined */
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300274
275/*[clinic input]
276module _sre
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100277class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
278class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
279class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300280[clinic start generated code]*/
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100281/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
Larry Hastings2d0a69a2015-05-03 14:49:19 -0700282
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300283/*[clinic input]
284_sre.getcodesize -> int
285[clinic start generated code]*/
286
287static int
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +0300288_sre_getcodesize_impl(PyObject *module)
289/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000290{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300291 return sizeof(SRE_CODE);
Guido van Rossumb700df92000-03-31 14:59:30 +0000292}
293
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300294/*[clinic input]
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300295_sre.ascii_iscased -> bool
296
297 character: int
298 /
299
300[clinic start generated code]*/
301
302static int
303_sre_ascii_iscased_impl(PyObject *module, int character)
304/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
305{
306 unsigned int ch = (unsigned int)character;
Sergey Fedoseev7f0d59f2018-09-12 17:49:09 +0500307 return ch < 128 && Py_ISALPHA(ch);
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300308}
309
310/*[clinic input]
311_sre.unicode_iscased -> bool
312
313 character: int
314 /
315
316[clinic start generated code]*/
317
318static int
319_sre_unicode_iscased_impl(PyObject *module, int character)
320/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
321{
322 unsigned int ch = (unsigned int)character;
323 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
324}
325
326/*[clinic input]
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300327_sre.ascii_tolower -> int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300328
329 character: int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300330 /
331
332[clinic start generated code]*/
333
334static int
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300335_sre_ascii_tolower_impl(PyObject *module, int character)
336/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000337{
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300338 return sre_lower_ascii(character);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000339}
340
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300341/*[clinic input]
342_sre.unicode_tolower -> int
343
344 character: int
345 /
346
347[clinic start generated code]*/
348
349static int
350_sre_unicode_tolower_impl(PyObject *module, int character)
351/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
352{
353 return sre_lower_unicode(character);
354}
355
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000356LOCAL(void)
357state_reset(SRE_STATE* state)
358{
animalize4a7f44a2019-02-18 21:26:37 +0800359 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000360 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000361
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000362 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000363 state->lastindex = -1;
364
365 state->repeat = NULL;
366
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000367 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000368}
369
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300370static const void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300372 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600373 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000374{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000375 /* given a python object, return a data pointer, a length (in
376 characters), and a character size. return NULL if the object
377 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000378
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000379 /* Unicode objects do not support the buffer API. So, get the data
380 directly instead. */
381 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200382 if (PyUnicode_READY(string) == -1)
383 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200385 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300386 *p_isbytes = 0;
387 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000388 }
389
Victor Stinner0058b862011-09-29 03:27:47 +0200390 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300391 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200392 PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300393 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000394 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000395
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300396 *p_length = view->len;
397 *p_charsize = 1;
398 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000399
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300400 if (view->buf == NULL) {
401 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
402 PyBuffer_Release(view);
403 view->buf = NULL;
404 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000405 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300406 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000407}
408
409LOCAL(PyObject*)
410state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000411 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000412{
413 /* prepare state object */
414
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000415 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300416 int isbytes, charsize;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300417 const void* ptr;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000418
419 memset(state, 0, sizeof(SRE_STATE));
420
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300421 state->mark = PyMem_New(const void *, pattern->groups * 2);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300422 if (!state->mark) {
423 PyErr_NoMemory();
424 goto err;
425 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000426 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000427 state->lastindex = -1;
428
Benjamin Petersone48944b2012-03-07 14:50:25 -0600429 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300430 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000431 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600432 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000433
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300434 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600435 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200436 "cannot use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600437 goto err;
438 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300439 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600440 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200441 "cannot use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600442 goto err;
443 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000444
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000445 /* adjust boundaries */
446 if (start < 0)
447 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000448 else if (start > length)
449 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000450
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000451 if (end < 0)
452 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000453 else if (end > length)
454 end = length;
455
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300456 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000457 state->charsize = charsize;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200458 state->match_all = 0;
459 state->must_advance = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000460
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000461 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000462
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000463 state->start = (void*) ((char*) ptr + start * state->charsize);
464 state->end = (void*) ((char*) ptr + end * state->charsize);
465
466 Py_INCREF(string);
467 state->string = string;
468 state->pos = start;
469 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000470
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000471 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600472 err:
Ammar Askar06e3a272020-06-01 17:21:43 +0000473 /* We add an explicit cast here because MSVC has a bug when
474 compiling C code where it believes that `const void**` cannot be
475 safely casted to `void*`, see bpo-39943 for details. */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100476 PyMem_Free((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300477 state->mark = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600478 if (state->buffer.buf)
479 PyBuffer_Release(&state->buffer);
480 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000481}
482
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000483LOCAL(void)
484state_fini(SRE_STATE* state)
485{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600486 if (state->buffer.buf)
487 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000488 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000489 data_stack_dealloc(state);
Ammar Askar06e3a272020-06-01 17:21:43 +0000490 /* See above PyMem_Del for why we explicitly cast here. */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100491 PyMem_Free((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300492 state->mark = NULL;
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000493}
494
/* calculate offset from start of string: the byte distance between `ptr`
   and the state's `beginning`, divided by the character size */
#define STATE_OFFSET(st, ptr) \
    (((char*)(ptr) - (char*)(st)->beginning) / (st)->charsize)
498
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000499LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300500getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300501 PyObject* string, Py_ssize_t start, Py_ssize_t end)
502{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300503 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300504 if (PyBytes_CheckExact(string) &&
505 start == 0 && end == PyBytes_GET_SIZE(string)) {
506 Py_INCREF(string);
507 return string;
508 }
509 return PyBytes_FromStringAndSize(
510 (const char *)ptr + start, end - start);
511 }
512 else {
513 return PyUnicode_Substring(string, start, end);
514 }
515}
516
517LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000518state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000519{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000520 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000521
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000522 index = (index - 1) * 2;
523
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000524 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000525 if (empty)
526 /* want empty string */
527 i = j = 0;
528 else {
Serhiy Storchaka228b12e2017-01-23 09:47:21 +0200529 Py_RETURN_NONE;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000530 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000531 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000532 i = STATE_OFFSET(state, state->mark[index]);
533 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000534 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000535
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300536 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000537}
538
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100540pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000541{
542 switch (status) {
543 case SRE_ERROR_RECURSION_LIMIT:
Yury Selivanovf488fb42015-07-03 01:04:23 -0400544 /* This error code seems to be unused. */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 PyErr_SetString(
Yury Selivanovf488fb42015-07-03 01:04:23 -0400546 PyExc_RecursionError,
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000547 "maximum recursion limit exceeded"
548 );
549 break;
550 case SRE_ERROR_MEMORY:
551 PyErr_NoMemory();
552 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000553 case SRE_ERROR_INTERRUPTED:
554 /* An exception has already been raised, so let it fly */
555 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000556 default:
557 /* other error codes indicate compiler/engine bugs */
558 PyErr_SetString(
559 PyExc_RuntimeError,
560 "internal error in regular expression engine"
561 );
562 }
563}
564
/* GC traverse function for pattern objects: visits the object's type and
   every PyObject member it references.  Returns 0 unless a Py_VISIT call
   returns early with the visitor's non-zero result. */
static int
pattern_traverse(PatternObject *self, visitproc visit, void *arg)
{
    /* the type itself is visited too - presumably because it is a heap
       type that each instance keeps a reference to (confirm) */
    Py_VISIT(Py_TYPE(self));
    Py_VISIT(self->groupindex);
    Py_VISIT(self->indexgroup);
    Py_VISIT(self->pattern);
    return 0;
}
574
/* GC clear function for pattern objects: drops the references to
   groupindex, indexgroup and pattern.  Always returns 0. */
static int
pattern_clear(PatternObject *self)
{
    Py_CLEAR(self->groupindex);
    Py_CLEAR(self->indexgroup);
    Py_CLEAR(self->pattern);
    return 0;
}
583
Guido van Rossumb700df92000-03-31 14:59:30 +0000584static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000585pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000586{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100587 PyTypeObject *tp = Py_TYPE(self);
588
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -0700589 PyObject_GC_UnTrack(self);
590 if (self->weakreflist != NULL) {
Raymond Hettinger027bb632004-05-31 03:09:25 +0000591 PyObject_ClearWeakRefs((PyObject *) self);
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -0700592 }
593 (void)pattern_clear(self);
594 tp->tp_free(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100595 Py_DECREF(tp);
Guido van Rossumb700df92000-03-31 14:59:30 +0000596}
597
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300598LOCAL(Py_ssize_t)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200599sre_match(SRE_STATE* state, SRE_CODE* pattern)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300600{
601 if (state->charsize == 1)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200602 return sre_ucs1_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300603 if (state->charsize == 2)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200604 return sre_ucs2_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300605 assert(state->charsize == 4);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200606 return sre_ucs4_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300607}
608
609LOCAL(Py_ssize_t)
610sre_search(SRE_STATE* state, SRE_CODE* pattern)
611{
612 if (state->charsize == 1)
613 return sre_ucs1_search(state, pattern);
614 if (state->charsize == 2)
615 return sre_ucs2_search(state, pattern);
616 assert(state->charsize == 4);
617 return sre_ucs4_search(state, pattern);
618}
619
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300620/*[clinic input]
621_sre.SRE_Pattern.match
622
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100623 cls: defining_class
624 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200625 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300626 pos: Py_ssize_t = 0
627 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300628
629Matches zero or more characters at the beginning of the string.
630[clinic start generated code]*/
631
Larry Hastings16c51912014-01-07 11:53:01 -0800632static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100633_sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
634 PyObject *string, Py_ssize_t pos,
635 Py_ssize_t endpos)
636/*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
Larry Hastings16c51912014-01-07 11:53:01 -0800637{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100638 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000639 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100640 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300641 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000642
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300643 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000644 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000645
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000646 state.ptr = state.start;
647
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000648 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
649
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200650 status = sre_match(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000651
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000652 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300653 if (PyErr_Occurred()) {
654 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000655 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300656 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000657
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100658 match = pattern_new_match(module_state, self, &state, status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000659 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300660 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000661}
662
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300663/*[clinic input]
664_sre.SRE_Pattern.fullmatch
665
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100666 cls: defining_class
667 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200668 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300669 pos: Py_ssize_t = 0
670 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300671
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +0300672Matches against all of the string.
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300673[clinic start generated code]*/
674
675static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100676_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
677 PyObject *string, Py_ssize_t pos,
678 Py_ssize_t endpos)
679/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200680{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100681 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200682 SRE_STATE state;
683 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300684 PyObject *match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200685
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300686 if (!state_init(&state, self, string, pos, endpos))
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200687 return NULL;
688
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200689 state.ptr = state.start;
690
691 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
692
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200693 state.match_all = 1;
694 status = sre_match(&state, PatternObject_GetCode(self));
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200695
696 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300697 if (PyErr_Occurred()) {
698 state_fini(&state);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200699 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300700 }
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200701
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100702 match = pattern_new_match(module_state, self, &state, status);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200703 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300704 return match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200705}
706
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300707/*[clinic input]
708_sre.SRE_Pattern.search
709
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100710 cls: defining_class
711 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200712 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300713 pos: Py_ssize_t = 0
714 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300715
716Scan through string looking for a match, and return a corresponding match object instance.
717
718Return None if no position in the string matches.
719[clinic start generated code]*/
720
721static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100722_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
723 PyObject *string, Py_ssize_t pos,
724 Py_ssize_t endpos)
725/*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000726{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100727 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000728 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100729 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300730 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000731
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300732 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000733 return NULL;
734
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000735 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
736
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300737 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000738
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000739 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
740
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300741 if (PyErr_Occurred()) {
742 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000743 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300744 }
Thomas Wouters89f507f2006-12-13 04:49:30 +0000745
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100746 match = pattern_new_match(module_state, self, &state, status);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300747 state_fini(&state);
748 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000749}
750
751static PyObject*
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200752call(const char* module, const char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000753{
754 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000755 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000756 PyObject* func;
757 PyObject* result;
758
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000759 if (!args)
760 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000761 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000762 if (!name)
763 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000764 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000765 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000766 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000767 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000768 func = PyObject_GetAttrString(mod, function);
769 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000770 if (!func)
771 return NULL;
772 result = PyObject_CallObject(func, args);
773 Py_DECREF(func);
774 Py_DECREF(args);
775 return result;
776}
777
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300778/*[clinic input]
779_sre.SRE_Pattern.findall
780
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200781 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300782 pos: Py_ssize_t = 0
783 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300784
785Return a list of all non-overlapping matches of pattern in string.
786[clinic start generated code]*/
787
788static PyObject *
789_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200790 Py_ssize_t pos, Py_ssize_t endpos)
791/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000792{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000793 SRE_STATE state;
794 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100795 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000796 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000797
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300798 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000799 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000800
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000801 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000802 if (!list) {
803 state_fini(&state);
804 return NULL;
805 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000806
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000807 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000808
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000809 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000810
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000811 state_reset(&state);
812
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000813 state.ptr = state.start;
814
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300815 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300816 if (PyErr_Occurred())
817 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000818
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000819 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000820 if (status == 0)
821 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000822 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000823 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000824 }
Tim Peters3d563502006-01-21 02:47:53 +0000825
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000826 /* don't bother to build a match object */
827 switch (self->groups) {
828 case 0:
829 b = STATE_OFFSET(&state, state.start);
830 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300831 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300832 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000833 if (!item)
834 goto error;
835 break;
836 case 1:
837 item = state_getslice(&state, 1, string, 1);
838 if (!item)
839 goto error;
840 break;
841 default:
842 item = PyTuple_New(self->groups);
843 if (!item)
844 goto error;
845 for (i = 0; i < self->groups; i++) {
846 PyObject* o = state_getslice(&state, i+1, string, 1);
847 if (!o) {
848 Py_DECREF(item);
849 goto error;
850 }
851 PyTuple_SET_ITEM(item, i, o);
852 }
853 break;
854 }
855
856 status = PyList_Append(list, item);
857 Py_DECREF(item);
858 if (status < 0)
859 goto error;
860
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200861 state.must_advance = (state.ptr == state.start);
862 state.start = state.ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000863 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000864
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000865 state_fini(&state);
866 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000867
868error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000869 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000870 state_fini(&state);
871 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000872
Guido van Rossumb700df92000-03-31 14:59:30 +0000873}
874
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300875/*[clinic input]
876_sre.SRE_Pattern.finditer
877
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100878 cls: defining_class
879 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300880 string: object
881 pos: Py_ssize_t = 0
882 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
883
884Return an iterator over all non-overlapping matches for the RE pattern in string.
885
886For each match, the iterator returns a match object.
887[clinic start generated code]*/
888
889static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100890_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
891 PyObject *string, Py_ssize_t pos,
892 Py_ssize_t endpos)
893/*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
Fredrik Lundh703ce812001-10-24 22:16:30 +0000894{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100895 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000896 PyObject* scanner;
897 PyObject* search;
898 PyObject* iterator;
899
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100900 scanner = pattern_scanner(module_state, self, string, pos, endpos);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000901 if (!scanner)
902 return NULL;
903
904 search = PyObject_GetAttrString(scanner, "search");
905 Py_DECREF(scanner);
906 if (!search)
907 return NULL;
908
909 iterator = PyCallIter_New(search, Py_None);
910 Py_DECREF(search);
911
912 return iterator;
913}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000914
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300915/*[clinic input]
916_sre.SRE_Pattern.scanner
917
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100918 cls: defining_class
919 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300920 string: object
921 pos: Py_ssize_t = 0
922 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
923
924[clinic start generated code]*/
925
926static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100927_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
928 PyObject *string, Py_ssize_t pos,
929 Py_ssize_t endpos)
930/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300931{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100932 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
933
934 return pattern_scanner(module_state, self, string, pos, endpos);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300935}
936
937/*[clinic input]
938_sre.SRE_Pattern.split
939
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200940 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300941 maxsplit: Py_ssize_t = 0
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300942
943Split string by the occurrences of pattern.
944[clinic start generated code]*/
945
946static PyObject *
947_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200948 Py_ssize_t maxsplit)
949/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000950{
951 SRE_STATE state;
952 PyObject* list;
953 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100954 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000955 Py_ssize_t n;
956 Py_ssize_t i;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300957 const void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000958
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200959 assert(self->codesize != 0);
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200960
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300961 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000962 return NULL;
963
964 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000965 if (!list) {
966 state_fini(&state);
967 return NULL;
968 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000969
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000970 n = 0;
971 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000972
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000973 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000974
975 state_reset(&state);
976
977 state.ptr = state.start;
978
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300979 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300980 if (PyErr_Occurred())
981 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000982
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000983 if (status <= 0) {
984 if (status == 0)
985 break;
986 pattern_error(status);
987 goto error;
988 }
Tim Peters3d563502006-01-21 02:47:53 +0000989
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000990 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300991 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000992 string, STATE_OFFSET(&state, last),
993 STATE_OFFSET(&state, state.start)
994 );
995 if (!item)
996 goto error;
997 status = PyList_Append(list, item);
998 Py_DECREF(item);
999 if (status < 0)
1000 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001001
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001002 /* add groups (if any) */
1003 for (i = 0; i < self->groups; i++) {
1004 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001005 if (!item)
1006 goto error;
1007 status = PyList_Append(list, item);
1008 Py_DECREF(item);
1009 if (status < 0)
1010 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001011 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001012
1013 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001014 state.must_advance = (state.ptr == state.start);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001015 last = state.start = state.ptr;
1016
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001017 }
1018
Fredrik Lundhf864aa82001-10-22 06:01:56 +00001019 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001020 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +00001021 string, STATE_OFFSET(&state, last), state.endpos
1022 );
1023 if (!item)
1024 goto error;
1025 status = PyList_Append(list, item);
1026 Py_DECREF(item);
1027 if (status < 0)
1028 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001029
1030 state_fini(&state);
1031 return list;
1032
1033error:
1034 Py_DECREF(list);
1035 state_fini(&state);
1036 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001037
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001038}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001039
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001040static PyObject*
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001041pattern_subx(_sremodulestate* module_state,
1042 PatternObject* self,
1043 PyObject* ptemplate,
1044 PyObject* string,
1045 Py_ssize_t count,
1046 Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001047{
1048 SRE_STATE state;
1049 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001050 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001051 PyObject* item;
1052 PyObject* filter;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001053 PyObject* match;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001054 const void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01001055 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001056 Py_ssize_t n;
1057 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001058 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001059 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001060 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001061
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001062 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001063 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001064 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001065 Py_INCREF(filter);
1066 filter_is_callable = 1;
1067 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001068 /* if not callable, check if it's a literal string */
1069 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001070 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001071 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001072 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001073 if (charsize == 1)
1074 literal = memchr(ptr, '\\', n) == NULL;
1075 else
1076 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001077 } else {
1078 PyErr_Clear();
1079 literal = 0;
1080 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06001081 if (view.buf)
1082 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001083 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001084 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001085 Py_INCREF(filter);
1086 filter_is_callable = 0;
1087 } else {
1088 /* not a literal; hand it over to the template compiler */
1089 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00001090 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001091 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001092 );
1093 if (!filter)
1094 return NULL;
1095 filter_is_callable = PyCallable_Check(filter);
1096 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00001097 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001098
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001099 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001100 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001101 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001102 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001103
1104 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001105 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001106 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001107 state_fini(&state);
1108 return NULL;
1109 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001110
1111 n = i = 0;
1112
1113 while (!count || n < count) {
1114
1115 state_reset(&state);
1116
1117 state.ptr = state.start;
1118
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001119 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001120 if (PyErr_Occurred())
1121 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001122
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001123 if (status <= 0) {
1124 if (status == 0)
1125 break;
1126 pattern_error(status);
1127 goto error;
1128 }
Tim Peters3d563502006-01-21 02:47:53 +00001129
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001130 b = STATE_OFFSET(&state, state.start);
1131 e = STATE_OFFSET(&state, state.ptr);
1132
1133 if (i < b) {
1134 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001135 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001136 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001137 if (!item)
1138 goto error;
1139 status = PyList_Append(list, item);
1140 Py_DECREF(item);
1141 if (status < 0)
1142 goto error;
1143
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001144 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001145
1146 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001147 /* pass match object through filter */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001148 match = pattern_new_match(module_state, self, &state, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001149 if (!match)
1150 goto error;
Petr Viktorinffd97532020-02-11 17:46:57 +01001151 item = PyObject_CallOneArg(filter, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001152 Py_DECREF(match);
1153 if (!item)
1154 goto error;
1155 } else {
1156 /* filter is literal string */
1157 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001158 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001159 }
1160
1161 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001162 if (item != Py_None) {
1163 status = PyList_Append(list, item);
1164 Py_DECREF(item);
1165 if (status < 0)
1166 goto error;
1167 }
Tim Peters3d563502006-01-21 02:47:53 +00001168
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001169 i = e;
1170 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001171 state.must_advance = (state.ptr == state.start);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001172 state.start = state.ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001173 }
1174
1175 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001176 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001177 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001178 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001179 if (!item)
1180 goto error;
1181 status = PyList_Append(list, item);
1182 Py_DECREF(item);
1183 if (status < 0)
1184 goto error;
1185 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001186
1187 state_fini(&state);
1188
Guido van Rossum4e173842001-12-07 04:25:10 +00001189 Py_DECREF(filter);
1190
Fredrik Lundhdac58492001-10-21 21:48:30 +00001191 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001192 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001193 if (!joiner) {
1194 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001195 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001196 }
1197 if (PyList_GET_SIZE(list) == 0) {
1198 Py_DECREF(list);
1199 item = joiner;
1200 }
1201 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001202 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001203 item = _PyBytes_Join(joiner, list);
1204 else
1205 item = PyUnicode_Join(joiner, list);
1206 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001207 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001208 if (!item)
1209 return NULL;
1210 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001211
1212 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001213 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001214
1215 return item;
1216
1217error:
1218 Py_DECREF(list);
1219 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001220 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001221 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001222
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001223}
1224
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001225/*[clinic input]
1226_sre.SRE_Pattern.sub
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001227
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001228 cls: defining_class
1229 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001230 repl: object
1231 string: object
1232 count: Py_ssize_t = 0
1233
1234Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1235[clinic start generated code]*/
1236
1237static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001238_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1239 PyObject *repl, PyObject *string, Py_ssize_t count)
1240/*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001241{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001242 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1243
1244 return pattern_subx(module_state, self, repl, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001245}
1246
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001247/*[clinic input]
1248_sre.SRE_Pattern.subn
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001249
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001250 cls: defining_class
1251 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001252 repl: object
1253 string: object
1254 count: Py_ssize_t = 0
1255
1256Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1257[clinic start generated code]*/
1258
1259static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001260_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1261 PyObject *repl, PyObject *string,
1262 Py_ssize_t count)
1263/*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001264{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001265 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1266
1267 return pattern_subx(module_state, self, repl, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001268}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001269
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001270/*[clinic input]
1271_sre.SRE_Pattern.__copy__
1272
1273[clinic start generated code]*/
1274
1275static PyObject *
1276_sre_SRE_Pattern___copy___impl(PatternObject *self)
1277/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001278{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001279 Py_INCREF(self);
1280 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001281}
1282
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001283/*[clinic input]
1284_sre.SRE_Pattern.__deepcopy__
1285
1286 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001287 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001288
1289[clinic start generated code]*/
1290
1291static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001292_sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1293/*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001294{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001295 Py_INCREF(self);
1296 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001297}
1298
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001299static PyObject *
1300pattern_repr(PatternObject *obj)
1301{
1302 static const struct {
1303 const char *name;
1304 int value;
1305 } flag_names[] = {
1306 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1307 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1308 {"re.LOCALE", SRE_FLAG_LOCALE},
1309 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1310 {"re.DOTALL", SRE_FLAG_DOTALL},
1311 {"re.UNICODE", SRE_FLAG_UNICODE},
1312 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1313 {"re.DEBUG", SRE_FLAG_DEBUG},
1314 {"re.ASCII", SRE_FLAG_ASCII},
1315 };
1316 PyObject *result = NULL;
1317 PyObject *flag_items;
Victor Stinner706768c2014-08-16 01:03:39 +02001318 size_t i;
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001319 int flags = obj->flags;
1320
1321 /* Omit re.UNICODE for valid string patterns. */
1322 if (obj->isbytes == 0 &&
1323 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1324 SRE_FLAG_UNICODE)
1325 flags &= ~SRE_FLAG_UNICODE;
1326
1327 flag_items = PyList_New(0);
1328 if (!flag_items)
1329 return NULL;
1330
1331 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1332 if (flags & flag_names[i].value) {
1333 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1334 if (!item)
1335 goto done;
1336
1337 if (PyList_Append(flag_items, item) < 0) {
1338 Py_DECREF(item);
1339 goto done;
1340 }
1341 Py_DECREF(item);
1342 flags &= ~flag_names[i].value;
1343 }
1344 }
1345 if (flags) {
1346 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1347 if (!item)
1348 goto done;
1349
1350 if (PyList_Append(flag_items, item) < 0) {
1351 Py_DECREF(item);
1352 goto done;
1353 }
1354 Py_DECREF(item);
1355 }
1356
1357 if (PyList_Size(flag_items) > 0) {
1358 PyObject *flags_result;
1359 PyObject *sep = PyUnicode_FromString("|");
1360 if (!sep)
1361 goto done;
1362 flags_result = PyUnicode_Join(sep, flag_items);
1363 Py_DECREF(sep);
1364 if (!flags_result)
1365 goto done;
1366 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1367 obj->pattern, flags_result);
1368 Py_DECREF(flags_result);
1369 }
1370 else {
1371 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1372 }
1373
1374done:
1375 Py_DECREF(flag_items);
1376 return result;
1377}
1378
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001379PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001380
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001381/* PatternObject's 'groupindex' method. */
1382static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02001383pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001384{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001385 if (self->groupindex == NULL)
1386 return PyDict_New();
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001387 return PyDictProxy_New(self->groupindex);
1388}
1389
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001390static int _validate(PatternObject *self); /* Forward */
1391
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001392/*[clinic input]
1393_sre.compile
1394
1395 pattern: object
1396 flags: int
1397 code: object(subclass_of='&PyList_Type')
1398 groups: Py_ssize_t
Victor Stinner726a57d2016-11-22 23:04:39 +01001399 groupindex: object(subclass_of='&PyDict_Type')
1400 indexgroup: object(subclass_of='&PyTuple_Type')
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001401
1402[clinic start generated code]*/
1403
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001404static PyObject *
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +03001405_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001406 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1407 PyObject *indexgroup)
Victor Stinner726a57d2016-11-22 23:04:39 +01001408/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001409{
1410 /* "compile" pattern descriptor to pattern object */
1411
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001412 _sremodulestate *module_state = get_sre_module_state(module);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001413 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001414 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001415
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001416 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001417 /* coverity[ampersand_in_size] */
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07001418 self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001419 if (!self)
1420 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001421 self->weakreflist = NULL;
1422 self->pattern = NULL;
1423 self->groupindex = NULL;
1424 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001425
1426 self->codesize = n;
1427
1428 for (i = 0; i < n; i++) {
1429 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001430 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001431 self->code[i] = (SRE_CODE) value;
1432 if ((unsigned long) self->code[i] != value) {
1433 PyErr_SetString(PyExc_OverflowError,
1434 "regular expression code size limit exceeded");
1435 break;
1436 }
1437 }
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07001438 PyObject_GC_Track(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001439
1440 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001441 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001442 return NULL;
1443 }
1444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001446 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 else {
1449 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001450 int charsize;
1451 Py_buffer view;
1452 view.buf = NULL;
1453 if (!getstring(pattern, &p_length, &self->isbytes,
1454 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 Py_DECREF(self);
1456 return NULL;
1457 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001458 if (view.buf)
1459 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001461
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001462 Py_INCREF(pattern);
1463 self->pattern = pattern;
1464
1465 self->flags = flags;
1466
1467 self->groups = groups;
1468
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001469 if (PyDict_GET_SIZE(groupindex) > 0) {
1470 Py_INCREF(groupindex);
1471 self->groupindex = groupindex;
1472 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1473 Py_INCREF(indexgroup);
1474 self->indexgroup = indexgroup;
1475 }
1476 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001477
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001478 if (!_validate(self)) {
1479 Py_DECREF(self);
1480 return NULL;
1481 }
1482
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001483 return (PyObject*) self;
1484}
1485
Guido van Rossumb700df92000-03-31 14:59:30 +00001486/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001487/* Code validation */
1488
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1510
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  It takes a complete, parenthesized
   printf argument list, hence the doubled parentheses at every use:
   VTRACE(("x=%d\n", x)).  Expands to nothing unless VVERBOSE is defined. */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the function it is expanded in return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   All of these expand in a scope that provides the pointers `code` and
   `end` plus the variable being assigned (`op`, `arg` or `skip`), and all
   of them FAIL rather than read at or beyond `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and step over it.  FAILs when skip minus `adj` is
   larger than the number of code words from the skip word to `end`.
   `adj` is parenthesized so that any expression can be passed for it. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip,                     \
               (void *)(code+skip)));                   \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001551
1552static int
1553_validate_charset(SRE_CODE *code, SRE_CODE *end)
1554{
1555 /* Some variables are manipulated by the macros above */
1556 SRE_CODE op;
1557 SRE_CODE arg;
1558 SRE_CODE offset;
1559 int i;
1560
1561 while (code < end) {
1562 GET_OP;
1563 switch (op) {
1564
1565 case SRE_OP_NEGATE:
1566 break;
1567
1568 case SRE_OP_LITERAL:
1569 GET_ARG;
1570 break;
1571
1572 case SRE_OP_RANGE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001573 case SRE_OP_RANGE_UNI_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001574 GET_ARG;
1575 GET_ARG;
1576 break;
1577
1578 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001579 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Benjamin Petersonca470632016-09-06 13:47:26 -07001580 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001581 FAIL;
1582 code += offset;
1583 break;
1584
1585 case SRE_OP_BIGCHARSET:
1586 GET_ARG; /* Number of blocks */
1587 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001588 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001589 FAIL;
1590 /* Make sure that each byte points to a valid block */
1591 for (i = 0; i < 256; i++) {
1592 if (((unsigned char *)code)[i] >= arg)
1593 FAIL;
1594 }
1595 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001596 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Benjamin Petersonca470632016-09-06 13:47:26 -07001597 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001598 FAIL;
1599 code += offset;
1600 break;
1601
1602 case SRE_OP_CATEGORY:
1603 GET_ARG;
1604 switch (arg) {
1605 case SRE_CATEGORY_DIGIT:
1606 case SRE_CATEGORY_NOT_DIGIT:
1607 case SRE_CATEGORY_SPACE:
1608 case SRE_CATEGORY_NOT_SPACE:
1609 case SRE_CATEGORY_WORD:
1610 case SRE_CATEGORY_NOT_WORD:
1611 case SRE_CATEGORY_LINEBREAK:
1612 case SRE_CATEGORY_NOT_LINEBREAK:
1613 case SRE_CATEGORY_LOC_WORD:
1614 case SRE_CATEGORY_LOC_NOT_WORD:
1615 case SRE_CATEGORY_UNI_DIGIT:
1616 case SRE_CATEGORY_UNI_NOT_DIGIT:
1617 case SRE_CATEGORY_UNI_SPACE:
1618 case SRE_CATEGORY_UNI_NOT_SPACE:
1619 case SRE_CATEGORY_UNI_WORD:
1620 case SRE_CATEGORY_UNI_NOT_WORD:
1621 case SRE_CATEGORY_UNI_LINEBREAK:
1622 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1623 break;
1624 default:
1625 FAIL;
1626 }
1627 break;
1628
1629 default:
1630 FAIL;
1631
1632 }
1633 }
1634
1635 return 1;
1636}
1637
1638static int
1639_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1640{
1641 /* Some variables are manipulated by the macros above */
1642 SRE_CODE op;
1643 SRE_CODE arg;
1644 SRE_CODE skip;
1645
1646 VTRACE(("code=%p, end=%p\n", code, end));
1647
1648 if (code > end)
1649 FAIL;
1650
1651 while (code < end) {
1652 GET_OP;
1653 switch (op) {
1654
1655 case SRE_OP_MARK:
1656 /* We don't check whether marks are properly nested; the
1657 sre_match() code is robust even if they don't, and the worst
1658 you can get is nonsensical match results. */
1659 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001660 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001661 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1662 FAIL;
1663 }
1664 break;
1665
1666 case SRE_OP_LITERAL:
1667 case SRE_OP_NOT_LITERAL:
1668 case SRE_OP_LITERAL_IGNORE:
1669 case SRE_OP_NOT_LITERAL_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001670 case SRE_OP_LITERAL_UNI_IGNORE:
1671 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001672 case SRE_OP_LITERAL_LOC_IGNORE:
1673 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001674 GET_ARG;
1675 /* The arg is just a character, nothing to check */
1676 break;
1677
1678 case SRE_OP_SUCCESS:
1679 case SRE_OP_FAILURE:
1680 /* Nothing to check; these normally end the matching process */
1681 break;
1682
1683 case SRE_OP_AT:
1684 GET_ARG;
1685 switch (arg) {
1686 case SRE_AT_BEGINNING:
1687 case SRE_AT_BEGINNING_STRING:
1688 case SRE_AT_BEGINNING_LINE:
1689 case SRE_AT_END:
1690 case SRE_AT_END_LINE:
1691 case SRE_AT_END_STRING:
1692 case SRE_AT_BOUNDARY:
1693 case SRE_AT_NON_BOUNDARY:
1694 case SRE_AT_LOC_BOUNDARY:
1695 case SRE_AT_LOC_NON_BOUNDARY:
1696 case SRE_AT_UNI_BOUNDARY:
1697 case SRE_AT_UNI_NON_BOUNDARY:
1698 break;
1699 default:
1700 FAIL;
1701 }
1702 break;
1703
1704 case SRE_OP_ANY:
1705 case SRE_OP_ANY_ALL:
1706 /* These have no operands */
1707 break;
1708
1709 case SRE_OP_IN:
1710 case SRE_OP_IN_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001711 case SRE_OP_IN_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001712 case SRE_OP_IN_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001713 GET_SKIP;
1714 /* Stop 1 before the end; we check the FAILURE below */
1715 if (!_validate_charset(code, code+skip-2))
1716 FAIL;
1717 if (code[skip-2] != SRE_OP_FAILURE)
1718 FAIL;
1719 code += skip-1;
1720 break;
1721
1722 case SRE_OP_INFO:
1723 {
1724 /* A minimal info field is
1725 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1726 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1727 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001728 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001729 SRE_CODE *newcode;
1730 GET_SKIP;
1731 newcode = code+skip-1;
1732 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001733 GET_ARG;
1734 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001735 /* Check that only valid flags are present */
1736 if ((flags & ~(SRE_INFO_PREFIX |
1737 SRE_INFO_LITERAL |
1738 SRE_INFO_CHARSET)) != 0)
1739 FAIL;
1740 /* PREFIX and CHARSET are mutually exclusive */
1741 if ((flags & SRE_INFO_PREFIX) &&
1742 (flags & SRE_INFO_CHARSET))
1743 FAIL;
1744 /* LITERAL implies PREFIX */
1745 if ((flags & SRE_INFO_LITERAL) &&
1746 !(flags & SRE_INFO_PREFIX))
1747 FAIL;
1748 /* Validate the prefix */
1749 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001750 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001751 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001752 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001753 /* Here comes the prefix string */
Benjamin Petersonca470632016-09-06 13:47:26 -07001754 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001755 FAIL;
1756 code += prefix_len;
1757 /* And here comes the overlap table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001758 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001759 FAIL;
1760 /* Each overlap value should be < prefix_len */
1761 for (i = 0; i < prefix_len; i++) {
1762 if (code[i] >= prefix_len)
1763 FAIL;
1764 }
1765 code += prefix_len;
1766 }
1767 /* Validate the charset */
1768 if (flags & SRE_INFO_CHARSET) {
1769 if (!_validate_charset(code, newcode-1))
1770 FAIL;
1771 if (newcode[-1] != SRE_OP_FAILURE)
1772 FAIL;
1773 code = newcode;
1774 }
1775 else if (code != newcode) {
1776 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1777 FAIL;
1778 }
1779 }
1780 break;
1781
1782 case SRE_OP_BRANCH:
1783 {
1784 SRE_CODE *target = NULL;
1785 for (;;) {
1786 GET_SKIP;
1787 if (skip == 0)
1788 break;
1789 /* Stop 2 before the end; we check the JUMP below */
1790 if (!_validate_inner(code, code+skip-3, groups))
1791 FAIL;
1792 code += skip-3;
1793 /* Check that it ends with a JUMP, and that each JUMP
1794 has the same target */
1795 GET_OP;
1796 if (op != SRE_OP_JUMP)
1797 FAIL;
1798 GET_SKIP;
1799 if (target == NULL)
1800 target = code+skip-1;
1801 else if (code+skip-1 != target)
1802 FAIL;
1803 }
1804 }
1805 break;
1806
1807 case SRE_OP_REPEAT_ONE:
1808 case SRE_OP_MIN_REPEAT_ONE:
1809 {
1810 SRE_CODE min, max;
1811 GET_SKIP;
1812 GET_ARG; min = arg;
1813 GET_ARG; max = arg;
1814 if (min > max)
1815 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001816 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001817 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001818 if (!_validate_inner(code, code+skip-4, groups))
1819 FAIL;
1820 code += skip-4;
1821 GET_OP;
1822 if (op != SRE_OP_SUCCESS)
1823 FAIL;
1824 }
1825 break;
1826
1827 case SRE_OP_REPEAT:
1828 {
1829 SRE_CODE min, max;
1830 GET_SKIP;
1831 GET_ARG; min = arg;
1832 GET_ARG; max = arg;
1833 if (min > max)
1834 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001835 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001836 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001837 if (!_validate_inner(code, code+skip-3, groups))
1838 FAIL;
1839 code += skip-3;
1840 GET_OP;
1841 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1842 FAIL;
1843 }
1844 break;
1845
1846 case SRE_OP_GROUPREF:
1847 case SRE_OP_GROUPREF_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001848 case SRE_OP_GROUPREF_UNI_IGNORE:
1849 case SRE_OP_GROUPREF_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001850 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001851 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001852 FAIL;
1853 break;
1854
1855 case SRE_OP_GROUPREF_EXISTS:
1856 /* The regex syntax for this is: '(?(group)then|else)', where
1857 'group' is either an integer group number or a group name,
1858 'then' and 'else' are sub-regexes, and 'else' is optional. */
1859 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001860 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001861 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001862 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001863 code--; /* The skip is relative to the first arg! */
1864 /* There are two possibilities here: if there is both a 'then'
1865 part and an 'else' part, the generated code looks like:
1866
1867 GROUPREF_EXISTS
1868 <group>
1869 <skipyes>
1870 ...then part...
1871 JUMP
1872 <skipno>
1873 (<skipyes> jumps here)
1874 ...else part...
1875 (<skipno> jumps here)
1876
1877 If there is only a 'then' part, it looks like:
1878
1879 GROUPREF_EXISTS
1880 <group>
1881 <skip>
1882 ...then part...
1883 (<skip> jumps here)
1884
1885 There is no direct way to decide which it is, and we don't want
1886 to allow arbitrary jumps anywhere in the code; so we just look
1887 for a JUMP opcode preceding our skip target.
1888 */
Benjamin Petersonca470632016-09-06 13:47:26 -07001889 if (skip >= 3 && skip-3 < (uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001890 code[skip-3] == SRE_OP_JUMP)
1891 {
1892 VTRACE(("both then and else parts present\n"));
1893 if (!_validate_inner(code+1, code+skip-3, groups))
1894 FAIL;
1895 code += skip-2; /* Position after JUMP, at <skipno> */
1896 GET_SKIP;
1897 if (!_validate_inner(code, code+skip-1, groups))
1898 FAIL;
1899 code += skip-1;
1900 }
1901 else {
1902 VTRACE(("only a then part present\n"));
1903 if (!_validate_inner(code+1, code+skip-1, groups))
1904 FAIL;
1905 code += skip-1;
1906 }
1907 break;
1908
1909 case SRE_OP_ASSERT:
1910 case SRE_OP_ASSERT_NOT:
1911 GET_SKIP;
1912 GET_ARG; /* 0 for lookahead, width for lookbehind */
1913 code--; /* Back up over arg to simplify math below */
1914 if (arg & 0x80000000)
1915 FAIL; /* Width too large */
1916 /* Stop 1 before the end; we check the SUCCESS below */
1917 if (!_validate_inner(code+1, code+skip-2, groups))
1918 FAIL;
1919 code += skip-2;
1920 GET_OP;
1921 if (op != SRE_OP_SUCCESS)
1922 FAIL;
1923 break;
1924
1925 default:
1926 FAIL;
1927
1928 }
1929 }
1930
1931 VTRACE(("okay\n"));
1932 return 1;
1933}
1934
1935static int
1936_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1937{
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001938 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1939 code >= end || end[-1] != SRE_OP_SUCCESS)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001940 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001941 return _validate_inner(code, end-1, groups);
1942}
1943
1944static int
1945_validate(PatternObject *self)
1946{
1947 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1948 {
1949 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1950 return 0;
1951 }
1952 else
1953 VTRACE(("Success!\n"));
1954 return 1;
1955}
1956
1957/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001958/* match methods */
1959
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07001960static int
1961match_traverse(MatchObject *self, visitproc visit, void *arg)
1962{
1963 Py_VISIT(Py_TYPE(self));
1964 Py_VISIT(self->string);
1965 Py_VISIT(self->regs);
1966 Py_VISIT(self->pattern);
1967 return 0;
1968}
1969
1970static int
1971match_clear(MatchObject *self)
1972{
1973 Py_CLEAR(self->string);
1974 Py_CLEAR(self->regs);
1975 Py_CLEAR(self->pattern);
1976 return 0;
1977}
1978
Guido van Rossumb700df92000-03-31 14:59:30 +00001979static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001980match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001981{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001982 PyTypeObject *tp = Py_TYPE(self);
1983
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07001984 PyObject_GC_UnTrack(self);
1985 (void)match_clear(self);
1986 tp->tp_free(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001987 Py_DECREF(tp);
Guido van Rossumb700df92000-03-31 14:59:30 +00001988}
1989
1990static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001991match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001992{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001993 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001994 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001995 Py_buffer view;
1996 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001997 const void* ptr;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001998 Py_ssize_t i, j;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001999
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002000 assert(0 <= index && index < self->groups);
Fredrik Lundh6f013982000-07-03 18:44:21 +00002001 index *= 2;
2002
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002003 if (self->string == Py_None || self->mark[index] < 0) {
2004 /* return default value if the string or group is undefined */
2005 Py_INCREF(def);
2006 return def;
2007 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002008
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002009 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03002010 if (ptr == NULL)
2011 return NULL;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02002012
2013 i = self->mark[index];
2014 j = self->mark[index+1];
2015 i = Py_MIN(i, length);
2016 j = Py_MIN(j, length);
2017 result = getslice(isbytes, ptr, self->string, i, j);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002018 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03002019 PyBuffer_Release(&view);
2020 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002021}
2022
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002023static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002024match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00002025{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002026 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00002027
Guido van Rossumddefaf32007-01-14 03:31:43 +00002028 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002029 /* Default value */
2030 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00002031
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03002032 if (PyIndex_Check(index)) {
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002033 i = PyNumber_AsSsize_t(index, NULL);
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03002034 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002035 else {
2036 i = -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00002037
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002038 if (self->pattern->groupindex) {
2039 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2040 if (index && PyLong_Check(index)) {
2041 i = PyLong_AsSsize_t(index);
2042 }
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002043 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002044 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002045 if (i < 0 || i >= self->groups) {
2046 /* raise IndexError if we were given a bad group number */
2047 if (!PyErr_Occurred()) {
2048 PyErr_SetString(PyExc_IndexError, "no such group");
2049 }
2050 return -1;
2051 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002052
2053 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002054}
2055
2056static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002057match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002058{
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002059 Py_ssize_t i = match_getindex(self, index);
2060
2061 if (i < 0) {
2062 return NULL;
2063 }
2064
2065 return match_getslice_by_index(self, i, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002066}
2067
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002068/*[clinic input]
2069_sre.SRE_Match.expand
2070
2071 template: object
2072
2073Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2074[clinic start generated code]*/
2075
2076static PyObject *
2077_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2078/*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002079{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002080 /* delegate to Python code */
2081 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002082 SRE_PY_MODULE, "_expand",
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002083 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002084 );
2085}
2086
2087static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002088match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002089{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002090 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002091 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002092
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002093 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002094
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002095 switch (size) {
2096 case 0:
Victor Stinner37834132020-10-27 17:12:53 +01002097 result = match_getslice(self, _PyLong_GetZero(), Py_None);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002098 break;
2099 case 1:
2100 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2101 break;
2102 default:
2103 /* fetch multiple items */
2104 result = PyTuple_New(size);
2105 if (!result)
2106 return NULL;
2107 for (i = 0; i < size; i++) {
2108 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002109 self, PyTuple_GET_ITEM(args, i), Py_None
2110 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002111 if (!item) {
2112 Py_DECREF(result);
2113 return NULL;
2114 }
2115 PyTuple_SET_ITEM(result, i, item);
2116 }
2117 break;
2118 }
2119 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002120}
2121
Eric V. Smith605bdae2016-09-11 08:55:43 -04002122static PyObject*
2123match_getitem(MatchObject* self, PyObject* name)
2124{
2125 return match_getslice(self, name, Py_None);
2126}
2127
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002128/*[clinic input]
2129_sre.SRE_Match.groups
2130
2131 default: object = None
2132 Is used for groups that did not participate in the match.
2133
2134Return a tuple containing all the subgroups of the match, from 1.
2135[clinic start generated code]*/
2136
2137static PyObject *
2138_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2139/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002140{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002141 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002142 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002143
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002144 result = PyTuple_New(self->groups-1);
2145 if (!result)
2146 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002147
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002148 for (index = 1; index < self->groups; index++) {
2149 PyObject* item;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002150 item = match_getslice_by_index(self, index, default_value);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002151 if (!item) {
2152 Py_DECREF(result);
2153 return NULL;
2154 }
2155 PyTuple_SET_ITEM(result, index-1, item);
2156 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002157
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002158 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002159}
2160
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002161/*[clinic input]
2162_sre.SRE_Match.groupdict
2163
2164 default: object = None
2165 Is used for groups that did not participate in the match.
2166
2167Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2168[clinic start generated code]*/
2169
2170static PyObject *
2171_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2172/*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002173{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002174 PyObject *result;
2175 PyObject *key;
2176 PyObject *value;
2177 Py_ssize_t pos = 0;
2178 Py_hash_t hash;
Guido van Rossumb700df92000-03-31 14:59:30 +00002179
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002180 result = PyDict_New();
2181 if (!result || !self->pattern->groupindex)
2182 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002183
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002184 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002185 int status;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002186 Py_INCREF(key);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002187 value = match_getslice(self, key, default_value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002188 if (!value) {
2189 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002190 goto failed;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002191 }
2192 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002193 Py_DECREF(value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002194 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002195 if (status < 0)
2196 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002197 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002199 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002200
2201failed:
Fredrik Lundh770617b2001-01-14 15:06:11 +00002202 Py_DECREF(result);
2203 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002204}
2205
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002206/*[clinic input]
2207_sre.SRE_Match.start -> Py_ssize_t
2208
2209 group: object(c_default="NULL") = 0
2210 /
2211
2212Return index of the start of the substring matched by group.
2213[clinic start generated code]*/
2214
2215static Py_ssize_t
2216_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2217/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002218{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002219 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002220
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002221 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002222 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002223 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002224
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002225 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002226 return self->mark[index*2];
Guido van Rossumb700df92000-03-31 14:59:30 +00002227}
2228
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002229/*[clinic input]
2230_sre.SRE_Match.end -> Py_ssize_t
2231
2232 group: object(c_default="NULL") = 0
2233 /
2234
2235Return index of the end of the substring matched by group.
2236[clinic start generated code]*/
2237
2238static Py_ssize_t
2239_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2240/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002241{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002242 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002243
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002244 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002245 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002246 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002247
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002248 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002249 return self->mark[index*2+1];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002250}
2251
2252LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002253_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002254{
2255 PyObject* pair;
2256 PyObject* item;
2257
2258 pair = PyTuple_New(2);
2259 if (!pair)
2260 return NULL;
2261
Christian Heimes217cfd12007-12-02 14:31:20 +00002262 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002263 if (!item)
2264 goto error;
2265 PyTuple_SET_ITEM(pair, 0, item);
2266
Christian Heimes217cfd12007-12-02 14:31:20 +00002267 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002268 if (!item)
2269 goto error;
2270 PyTuple_SET_ITEM(pair, 1, item);
2271
2272 return pair;
2273
2274 error:
2275 Py_DECREF(pair);
2276 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002277}
2278
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002279/*[clinic input]
2280_sre.SRE_Match.span
2281
2282 group: object(c_default="NULL") = 0
2283 /
2284
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002285For match object m, return the 2-tuple (m.start(group), m.end(group)).
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002286[clinic start generated code]*/
2287
2288static PyObject *
2289_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002290/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002291{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002292 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002293
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002294 if (index < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002295 return NULL;
2296 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002297
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002298 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002299 return _pair(self->mark[index*2], self->mark[index*2+1]);
2300}
2301
2302static PyObject*
2303match_regs(MatchObject* self)
2304{
2305 PyObject* regs;
2306 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002307 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002308
2309 regs = PyTuple_New(self->groups);
2310 if (!regs)
2311 return NULL;
2312
2313 for (index = 0; index < self->groups; index++) {
2314 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2315 if (!item) {
2316 Py_DECREF(regs);
2317 return NULL;
2318 }
2319 PyTuple_SET_ITEM(regs, index, item);
2320 }
2321
2322 Py_INCREF(regs);
2323 self->regs = regs;
2324
2325 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002326}
2327
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002328/*[clinic input]
2329_sre.SRE_Match.__copy__
2330
2331[clinic start generated code]*/
2332
2333static PyObject *
2334_sre_SRE_Match___copy___impl(MatchObject *self)
2335/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002336{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002337 Py_INCREF(self);
2338 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002339}
2340
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002341/*[clinic input]
2342_sre.SRE_Match.__deepcopy__
2343
2344 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002345 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002346
2347[clinic start generated code]*/
2348
2349static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002350_sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2351/*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002352{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002353 Py_INCREF(self);
2354 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002355}
2356
/* Docstring of the Match type; installed through the Py_tp_doc slot in
   match_slots. */
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002357PyDoc_STRVAR(match_doc,
2358"The result of re.match() and re.search().\n\
2359Match objects always have a boolean value of True.");
2360
/* Docstring of Match.group(); referenced by the "group" entry of
   match_methods. */
2361PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002362"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002363 Return subgroup(s) of the match by indices or names.\n\
2364 For 0 returns the entire match.");
2365
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002366static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002367match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
Guido van Rossumb700df92000-03-31 14:59:30 +00002368{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002369 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002370 return PyLong_FromSsize_t(self->lastindex);
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002371 Py_RETURN_NONE;
Guido van Rossumb700df92000-03-31 14:59:30 +00002372}
2373
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002374static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002375match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002376{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002377 if (self->pattern->indexgroup &&
2378 self->lastindex >= 0 &&
2379 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2380 {
2381 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2382 self->lastindex);
2383 Py_INCREF(result);
2384 return result;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002385 }
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002386 Py_RETURN_NONE;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002387}
2388
2389static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002390match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002391{
2392 if (self->regs) {
2393 Py_INCREF(self->regs);
2394 return self->regs;
2395 } else
2396 return match_regs(self);
2397}
2398
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002399static PyObject *
2400match_repr(MatchObject *self)
2401{
2402 PyObject *result;
2403 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2404 if (group0 == NULL)
2405 return NULL;
2406 result = PyUnicode_FromFormat(
sth8b91eda2019-03-10 11:29:14 +01002407 "<%s object; span=(%zd, %zd), match=%.50R>",
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002408 Py_TYPE(self)->tp_name,
2409 self->mark[0], self->mark[1], group0);
2410 Py_DECREF(group0);
2411 return result;
2412}
2413
2414
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002415static PyObject*
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002416pattern_new_match(_sremodulestate* module_state,
2417 PatternObject* pattern,
2418 SRE_STATE* state,
2419 Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002420{
2421 /* create match object (from state object) */
2422
2423 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002424 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002425 char* base;
2426 int n;
2427
2428 if (status > 0) {
2429
2430 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002431 /* coverity[ampersand_in_size] */
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002432 match = PyObject_GC_NewVar(MatchObject,
2433 module_state->Match_Type,
2434 2*(pattern->groups+1));
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002435 if (!match)
2436 return NULL;
2437
2438 Py_INCREF(pattern);
2439 match->pattern = pattern;
2440
2441 Py_INCREF(state->string);
2442 match->string = state->string;
2443
2444 match->regs = NULL;
2445 match->groups = pattern->groups+1;
2446
2447 /* fill in group slices */
2448
2449 base = (char*) state->beginning;
2450 n = state->charsize;
2451
2452 match->mark[0] = ((char*) state->start - base) / n;
2453 match->mark[1] = ((char*) state->ptr - base) / n;
2454
2455 for (i = j = 0; i < pattern->groups; i++, j+=2)
2456 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2457 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2458 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2459 } else
2460 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2461
2462 match->pos = state->pos;
2463 match->endpos = state->endpos;
2464
2465 match->lastindex = state->lastindex;
2466
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002467 PyObject_GC_Track(match);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002468 return (PyObject*) match;
2469
2470 } else if (status == 0) {
2471
2472 /* no match */
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002473 Py_RETURN_NONE;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002474
2475 }
2476
2477 /* internal error */
2478 pattern_error(status);
2479 return NULL;
2480}
2481
2482
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002483/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002484/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002485
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002486static int
2487scanner_traverse(ScannerObject *self, visitproc visit, void *arg)
2488{
2489 Py_VISIT(Py_TYPE(self));
2490 Py_VISIT(self->pattern);
2491 return 0;
2492}
2493
2494static int
2495scanner_clear(ScannerObject *self)
2496{
2497 Py_CLEAR(self->pattern);
2498 return 0;
2499}
2500
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002501static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002502scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002503{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002504 PyTypeObject *tp = Py_TYPE(self);
2505
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002506 PyObject_GC_UnTrack(self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002507 state_fini(&self->state);
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002508 (void)scanner_clear(self);
2509 tp->tp_free(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002510 Py_DECREF(tp);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002511}
2512
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002513/*[clinic input]
2514_sre.SRE_Scanner.match
2515
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002516 cls: defining_class
2517 /
2518
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002519[clinic start generated code]*/
2520
2521static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002522_sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2523/*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002524{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002525 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002526 SRE_STATE* state = &self->state;
2527 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002528 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002529
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002530 if (state->start == NULL)
2531 Py_RETURN_NONE;
2532
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002533 state_reset(state);
2534
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002535 state->ptr = state->start;
2536
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002537 status = sre_match(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002538 if (PyErr_Occurred())
2539 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002540
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002541 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2542 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002543
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002544 if (status == 0)
2545 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002546 else {
2547 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002548 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002549 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002550
2551 return match;
2552}
2553
2554
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002555/*[clinic input]
2556_sre.SRE_Scanner.search
2557
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002558 cls: defining_class
2559 /
2560
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002561[clinic start generated code]*/
2562
2563static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002564_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2565/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002566{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002567 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002568 SRE_STATE* state = &self->state;
2569 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002570 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002571
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002572 if (state->start == NULL)
2573 Py_RETURN_NONE;
2574
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002575 state_reset(state);
2576
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002577 state->ptr = state->start;
2578
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002579 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002580 if (PyErr_Occurred())
2581 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002582
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002583 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2584 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002585
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002586 if (status == 0)
2587 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002588 else {
2589 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002590 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002591 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002592
2593 return match;
2594}
2595
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002596static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002597pattern_scanner(_sremodulestate *module_state,
2598 PatternObject *self,
2599 PyObject *string,
2600 Py_ssize_t pos,
2601 Py_ssize_t endpos)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002602{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002603 ScannerObject* scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002604
2605 /* create scanner object */
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002606 scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002607 if (!scanner)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002608 return NULL;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002609 scanner->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002610
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002611 /* create search state object */
2612 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2613 Py_DECREF(scanner);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002614 return NULL;
2615 }
2616
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002617 Py_INCREF(self);
2618 scanner->pattern = (PyObject*) self;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002619
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002620 PyObject_GC_Track(scanner);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002621 return (PyObject*) scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002622}
2623
Victor Stinnerb44fb122016-11-21 16:35:08 +01002624static Py_hash_t
2625pattern_hash(PatternObject *self)
2626{
2627 Py_hash_t hash, hash2;
2628
2629 hash = PyObject_Hash(self->pattern);
2630 if (hash == -1) {
2631 return -1;
2632 }
2633
2634 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2635 hash ^= hash2;
2636
2637 hash ^= self->flags;
2638 hash ^= self->isbytes;
2639 hash ^= self->codesize;
2640
2641 if (hash == -1) {
2642 hash = -2;
2643 }
2644 return hash;
2645}
2646
2647static PyObject*
2648pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2649{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002650 PyTypeObject *tp = Py_TYPE(lefto);
2651 _sremodulestate *module_state = get_sre_module_state_by_class(tp);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002652 PatternObject *left, *right;
2653 int cmp;
2654
2655 if (op != Py_EQ && op != Py_NE) {
2656 Py_RETURN_NOTIMPLEMENTED;
2657 }
2658
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002659 if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2660 {
Victor Stinnerb44fb122016-11-21 16:35:08 +01002661 Py_RETURN_NOTIMPLEMENTED;
2662 }
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01002663
2664 if (lefto == righto) {
2665 /* a pattern is equal to itself */
2666 return PyBool_FromLong(op == Py_EQ);
2667 }
2668
Victor Stinnerb44fb122016-11-21 16:35:08 +01002669 left = (PatternObject *)lefto;
2670 right = (PatternObject *)righto;
2671
2672 cmp = (left->flags == right->flags
2673 && left->isbytes == right->isbytes
Victor Stinnere670b2d2016-11-22 15:23:00 +01002674 && left->codesize == right->codesize);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002675 if (cmp) {
2676 /* Compare the code and the pattern because the same pattern can
2677 produce different codes depending on the locale used to compile the
2678 pattern when the re.LOCALE flag is used. Don't compare groups,
2679 indexgroup nor groupindex: they are derivated from the pattern. */
2680 cmp = (memcmp(left->code, right->code,
2681 sizeof(left->code[0]) * left->codesize) == 0);
2682 }
2683 if (cmp) {
2684 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2685 Py_EQ);
2686 if (cmp < 0) {
2687 return NULL;
2688 }
2689 }
2690 if (op == Py_NE) {
2691 cmp = !cmp;
2692 }
2693 return PyBool_FromLong(cmp);
2694}
2695
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002696#include "clinic/_sre.c.h"
2697
/* Method table of re.Pattern; installed through the Py_tp_methods slot in
   pattern_slots.  The *_METHODDEF entries are macros, presumably supplied
   by the clinic/_sre.c.h include just above (confirm). */
2698static PyMethodDef pattern_methods[] = {
2699 _SRE_SRE_PATTERN_MATCH_METHODDEF
2700 _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
2701 _SRE_SRE_PATTERN_SEARCH_METHODDEF
2702 _SRE_SRE_PATTERN_SUB_METHODDEF
2703 _SRE_SRE_PATTERN_SUBN_METHODDEF
2704 _SRE_SRE_PATTERN_FINDALL_METHODDEF
2705 _SRE_SRE_PATTERN_SPLIT_METHODDEF
2706 _SRE_SRE_PATTERN_FINDITER_METHODDEF
2707 _SRE_SRE_PATTERN_SCANNER_METHODDEF
2708 _SRE_SRE_PATTERN___COPY___METHODDEF
2709 _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002710 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2711 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002712 {NULL, NULL}
2713};
2714
/* Computed attributes of re.Pattern; installed through the Py_tp_getset
   slot in pattern_slots.  No setter is given for "groupindex". */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002715static PyGetSetDef pattern_getset[] = {
2716 {"groupindex", (getter)pattern_groupindex, (setter)NULL,
2717 "A dictionary mapping group names to group numbers."},
2718 {NULL} /* Sentinel */
2719};
2720
/* PAT_OFF(x): offset of field x inside PatternObject. */
2721#define PAT_OFF(x) offsetof(PatternObject, x)
/* Struct fields of PatternObject exposed as read-only attributes;
   installed through the Py_tp_members slot in pattern_slots.  The
   "__weaklistoffset__" entry gives the offset of the weakreflist field. */
2722static PyMemberDef pattern_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002723 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY,
2724 "The pattern string from which the RE object was compiled."},
2725 {"flags", T_INT, PAT_OFF(flags), READONLY,
2726 "The regex matching flags."},
2727 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY,
2728 "The number of capturing groups in the pattern."},
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002729 {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002730 {NULL} /* Sentinel */
2731};
2732
/* Type slots of re.Pattern, consumed through pattern_spec: lifecycle
   (dealloc, traverse, clear), repr, hash, rich comparison, docstring and
   the method/member/getset tables.  Terminated by a {0, NULL} entry. */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002733static PyType_Slot pattern_slots[] = {
2734 {Py_tp_dealloc, (destructor)pattern_dealloc},
2735 {Py_tp_repr, (reprfunc)pattern_repr},
2736 {Py_tp_hash, (hashfunc)pattern_hash},
2737 {Py_tp_doc, (void *)pattern_doc},
2738 {Py_tp_richcompare, pattern_richcompare},
2739 {Py_tp_methods, pattern_methods},
2740 {Py_tp_members, pattern_members},
2741 {Py_tp_getset, pattern_getset},
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002742 {Py_tp_traverse, pattern_traverse},
2743 {Py_tp_clear, pattern_clear},
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002744 {0, NULL},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002745};
2746
/* Spec from which sre_exec() creates the re.Pattern heap type.  Items are
   sized as SRE_CODE.  Py_TPFLAGS_HAVE_GC goes together with the
   traverse/clear slots listed in pattern_slots. */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002747static PyType_Spec pattern_spec = {
2748 .name = "re.Pattern",
2749 .basicsize = sizeof(PatternObject),
2750 .itemsize = sizeof(SRE_CODE),
Erlend Egeberg Aasland9746cda2021-04-30 16:04:57 +02002751 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002752 Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002753 .slots = pattern_slots,
Eric V. Smith605bdae2016-09-11 08:55:43 -04002754};
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002755
/* Method table of re.Match; installed through the Py_tp_methods slot in
   match_slots.  "group" is the only entry written out by hand (it takes
   METH_VARARGS); the *_METHODDEF entries are macros, presumably supplied
   by clinic/_sre.c.h (confirm). */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002756static PyMethodDef match_methods[] = {
2757 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2758 _SRE_SRE_MATCH_START_METHODDEF
2759 _SRE_SRE_MATCH_END_METHODDEF
2760 _SRE_SRE_MATCH_SPAN_METHODDEF
2761 _SRE_SRE_MATCH_GROUPS_METHODDEF
2762 _SRE_SRE_MATCH_GROUPDICT_METHODDEF
2763 _SRE_SRE_MATCH_EXPAND_METHODDEF
2764 _SRE_SRE_MATCH___COPY___METHODDEF
2765 _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002766 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2767 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002768 {NULL, NULL}
2769};
2770
/* Computed attributes of re.Match; installed through the Py_tp_getset slot
   in match_slots.  None of them has a setter; "regs" has no docstring. */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002771static PyGetSetDef match_getset[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002772 {"lastindex", (getter)match_lastindex_get, (setter)NULL,
2773 "The integer index of the last matched capturing group."},
2774 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
2775 "The name of the last matched capturing group."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002776 {"regs", (getter)match_regs_get, (setter)NULL},
2777 {NULL}
2778};
2779
/* MATCH_OFF(x): offset of field x inside MatchObject. */
2780#define MATCH_OFF(x) offsetof(MatchObject, x)
/* Struct fields of MatchObject exposed as read-only attributes; installed
   through the Py_tp_members slot in match_slots.  Note that the attribute
   named "re" maps to the pattern field. */
2781static PyMemberDef match_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002782 {"string", T_OBJECT, MATCH_OFF(string), READONLY,
2783 "The string passed to match() or search()."},
2784 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY,
2785 "The regular expression object."},
2786 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY,
2787 "The index into the string at which the RE engine started looking for a match."},
2788 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY,
2789 "The index into the string beyond which the RE engine will not go."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002790 {NULL}
2791};
2792
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
/* Type slots of re.Match, consumed through match_spec: lifecycle (dealloc,
   traverse, clear), repr, docstring, the method/member/getset tables and
   subscripting.  Terminated by a {0, NULL} entry. */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002795static PyType_Slot match_slots[] = {
2796 {Py_tp_dealloc, match_dealloc},
2797 {Py_tp_repr, match_repr},
2798 {Py_tp_doc, (void *)match_doc},
2799 {Py_tp_methods, match_methods},
2800 {Py_tp_members, match_members},
2801 {Py_tp_getset, match_getset},
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002802 {Py_tp_traverse, match_traverse},
2803 {Py_tp_clear, match_clear},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002804
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002805 /* As mapping.
2806 *
2807 * Match objects do not support length or assignment, but do support
2808 * __getitem__.
2809 */
2810 {Py_mp_subscript, match_getitem},
2811
2812 {0, NULL},
2813};
2814
/* Spec from which sre_exec() creates the re.Match heap type.  Items are
   sized as Py_ssize_t; pattern_new_match() allocates 2*(groups+1) of them.
   Py_TPFLAGS_HAVE_GC goes together with the traverse/clear slots listed in
   match_slots. */
2815static PyType_Spec match_spec = {
2816 .name = "re.Match",
2817 .basicsize = sizeof(MatchObject),
2818 .itemsize = sizeof(Py_ssize_t),
Erlend Egeberg Aasland9746cda2021-04-30 16:04:57 +02002819 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002820 Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002821 .slots = match_slots,
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002822};
2823
/* Method table of the scanner type (match and search); installed through
   the Py_tp_methods slot in scanner_slots. */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002824static PyMethodDef scanner_methods[] = {
2825 _SRE_SRE_SCANNER_MATCH_METHODDEF
2826 _SRE_SRE_SCANNER_SEARCH_METHODDEF
2827 {NULL, NULL}
2828};
2829
/* SCAN_OFF(x): offset of field x inside ScannerObject. */
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002830#define SCAN_OFF(x) offsetof(ScannerObject, x)
/* The scanner exposes its pattern field as a read-only attribute;
   installed through the Py_tp_members slot in scanner_slots. */
2831static PyMemberDef scanner_members[] = {
2832 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
2833 {NULL} /* Sentinel */
2834};
2835
/* Type slots of the scanner type, consumed through scanner_spec:
   lifecycle (dealloc, traverse, clear) plus the method and member tables.
   Terminated by a {0, NULL} entry. */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002836static PyType_Slot scanner_slots[] = {
2837 {Py_tp_dealloc, scanner_dealloc},
2838 {Py_tp_methods, scanner_methods},
2839 {Py_tp_members, scanner_members},
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002840 {Py_tp_traverse, scanner_traverse},
2841 {Py_tp_clear, scanner_clear},
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002842 {0, NULL},
2843};
2844
/* Spec from which sre_exec() creates the scanner heap type.  The type name
   is built from SRE_MODULE; no itemsize is given.  Py_TPFLAGS_HAVE_GC goes
   together with the traverse/clear slots listed in scanner_slots. */
2845static PyType_Spec scanner_spec = {
2846 .name = "_" SRE_MODULE ".SRE_Scanner",
2847 .basicsize = sizeof(ScannerObject),
Erlend Egeberg Aasland9746cda2021-04-30 16:04:57 +02002848 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
Miss Islington (bot)da9e0cb2021-05-27 17:23:37 -07002849 Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002850 .slots = scanner_slots,
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002851};
2852
/* Module-level functions of the extension module; referenced by the
   m_methods field of sremodule. */
Guido van Rossumb700df92000-03-31 14:59:30 +00002853static PyMethodDef _functions[] = {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002854 _SRE_COMPILE_METHODDEF
2855 _SRE_GETCODESIZE_METHODDEF
Serhiy Storchaka6d336a02017-05-09 23:37:14 +03002856 _SRE_ASCII_ISCASED_METHODDEF
2857 _SRE_UNICODE_ISCASED_METHODDEF
Serhiy Storchaka7186cc22017-05-05 10:42:46 +03002858 _SRE_ASCII_TOLOWER_METHODDEF
2859 _SRE_UNICODE_TOLOWER_METHODDEF
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002860 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002861};
2862
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002863static int
2864sre_traverse(PyObject *module, visitproc visit, void *arg)
2865{
2866 _sremodulestate *state = get_sre_module_state(module);
2867
2868 Py_VISIT(state->Pattern_Type);
2869 Py_VISIT(state->Match_Type);
2870 Py_VISIT(state->Scanner_Type);
2871
2872 return 0;
2873}
2874
2875static int
2876sre_clear(PyObject *module)
2877{
2878 _sremodulestate *state = get_sre_module_state(module);
2879
2880 Py_CLEAR(state->Pattern_Type);
2881 Py_CLEAR(state->Match_Type);
2882 Py_CLEAR(state->Scanner_Type);
2883
2884 return 0;
2885}
2886
2887static void
2888sre_free(void *module)
2889{
2890 sre_clear((PyObject *)module);
2891}
2892
/* CREATE_TYPE(m, type, spec): create a heap type from spec with
   PyType_FromModuleAndSpec(m, spec, NULL) and store it in type.  On
   failure control jumps to a label named "error", which the enclosing
   function has to provide. */
2893#define CREATE_TYPE(m, type, spec) \
2894do { \
2895 type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
2896 if (type == NULL) { \
2897 goto error; \
2898 } \
2899} while (0)
2900
/* ADD_ULONG_CONSTANT(module, name, value): build a Python int from value
   with PyLong_FromUnsignedLong() and add it to module under name.  The
   macro drops its own reference to the int whether or not the add
   succeeded.  On failure control jumps to a label named "error", which
   the enclosing function has to provide. */
2901#define ADD_ULONG_CONSTANT(module, name, value) \
2902 do { \
2903 PyObject *o = PyLong_FromUnsignedLong(value); \
2904 if (!o) \
2905 goto error; \
2906 int res = PyModule_AddObjectRef(module, name, o); \
2907 Py_DECREF(o); \
2908 if (res < 0) { \
2909 goto error; \
2910 } \
2911} while (0)
2912
2913static int
2914sre_exec(PyObject *m)
2915{
2916 _sremodulestate *state;
2917
2918 /* Create heap types */
2919 state = get_sre_module_state(m);
2920 CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
2921 CREATE_TYPE(m, state->Match_Type, &match_spec);
2922 CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
2923
2924 if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
2925 goto error;
2926 }
2927
2928 if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
2929 goto error;
2930 }
2931
2932 ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
2933 ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
2934
2935 if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
2936 goto error;
2937 }
2938
2939 return 0;
2940
2941error:
2942 return -1;
2943}
2944
/* Module slots referenced by sremodule.m_slots: sre_exec is registered as
   the Py_mod_exec function.  Terminated by a {0, NULL} entry. */
2945static PyModuleDef_Slot sre_slots[] = {
2946 {Py_mod_exec, sre_exec},
2947 {0, NULL},
Martin v. Löwis1a214512008-06-11 05:26:20 +00002948};
2949
/* Module definition handed to PyModuleDef_Init() by PyInit__sre().  The
   module state is sized as _sremodulestate and is managed by sre_traverse,
   sre_clear and sre_free. */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002950static struct PyModuleDef sremodule = {
2951 .m_base = PyModuleDef_HEAD_INIT,
2952 .m_name = "_" SRE_MODULE,
2953 .m_size = sizeof(_sremodulestate),
2954 .m_methods = _functions,
2955 .m_slots = sre_slots,
2956 .m_traverse = sre_traverse,
2957 .m_free = sre_free,
2958 .m_clear = sre_clear,
2959};
2960
2961PyMODINIT_FUNC
2962PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002963{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002964 return PyModuleDef_Init(&sremodule);
Guido van Rossumb700df92000-03-31 14:59:30 +00002965}
2966
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002967/* vim:ts=4:sw=4:et
2968*/