/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl created (based on existing template matcher code)
 * 2000-03-06 fl first alpha, sort of
 * 2000-08-01 fl fixes for 1.6b1
 * 2000-08-07 fl use PyOS_CheckStack() if available
 * 2000-09-20 fl added expand method
 * 2001-03-20 fl lots of fixes for 2.1b2
 * 2001-04-15 fl export copyright as Python attribute, not global
 * 2001-04-28 fl added __copy__ methods (work in progress)
 * 2001-05-14 fl fixes for 1.5.2 compatibility
 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl added sub/subn primitive
 * 2001-10-24 fl added finditer primitive (for 2.2 only)
 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn implemented non recursive scheme
 * 2013-02-04 mrab added fullmatch primitive
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license. For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */

/* version/copyright banner string for this engine */
static const char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";

Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Victor Stinner37834132020-10-27 17:12:53 +010044#include "pycore_long.h" // _PyLong_GetZero()
Victor Stinnercdad2722021-04-22 00:52:52 +020045#include "pycore_moduleobject.h" // _PyModule_GetState()
Victor Stinner4a21e572020-04-15 02:35:41 +020046#include "structmember.h" // PyMemberDef
Guido van Rossumb700df92000-03-31 14:59:30 +000047
48#include "sre.h"
49
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030050#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
51
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000052#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000053
/* name of this module, minus the leading underscore; may be overridden
   by defining SRE_MODULE before this point */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

#define SRE_PY_MODULE "re"

/* defining this one enables tracing */
#undef VERBOSE

/* -------------------------------------------------------------------- */

/* LOCAL(type) declares a file-local inline function returning `type` */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif

/* error codes (parenthesized so they stay intact inside larger expressions) */
#define SRE_ERROR_ILLEGAL (-1) /* illegal opcode */
#define SRE_ERROR_STATE (-2) /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT (-3) /* runaway recursion */
#define SRE_ERROR_MEMORY (-9) /* out of memory */
#define SRE_ERROR_INTERRUPTED (-10) /* signal handler raised exception */

/* TRACE((fmt, ...)) prints via printf when VERBOSE is defined and expands
   to nothing otherwise; note the double parentheses at the call site */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif

/* -------------------------------------------------------------------- */
/* search engine state */

/* ASCII character predicates.  Each one tests an upper bound first, so
   values above that bound are rejected before the Py_IS* helper is
   evaluated.
   NOTE(review): presumably the Py_IS* helpers only look at the low byte
   of their argument -- confirm in pyctype.h. */
#define SRE_IS_DIGIT(ch)\
    ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch)\
    ((ch) <= ' ' && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch)\
    ((ch) == '\n')
#define SRE_IS_WORD(ch)\
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))

/* Lowercase ch with Py_TOLOWER when it is below 128; any other value is
   returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
}

/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* values outside 0..255 are never alphanumeric here; isalnum() is only
   consulted for values that fit in one byte */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lowercase ch with the C library's tolower() when ch < 256; larger
   values are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
}

/* Uppercase ch with the C library's toupper() when ch < 256; larger
   values are returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    return ((ch) < 256 ? (unsigned int)toupper((ch)) : ch);
}

/* unicode-specific character predicates */

/* thin wrappers over the Py_UNICODE_* classification macros; note that
   the "digit" predicate maps to Py_UNICODE_ISDECIMAL */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')

/* Lowercase ch using Py_UNICODE_TOLOWER. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER(ch);
}

/* Uppercase ch using Py_UNICODE_TOUPPER. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOUPPER(ch);
}

Guido van Rossumb700df92000-03-31 14:59:30 +0000139LOCAL(int)
140sre_category(SRE_CODE category, unsigned int ch)
141{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000142 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000143
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000144 case SRE_CATEGORY_DIGIT:
145 return SRE_IS_DIGIT(ch);
146 case SRE_CATEGORY_NOT_DIGIT:
147 return !SRE_IS_DIGIT(ch);
148 case SRE_CATEGORY_SPACE:
149 return SRE_IS_SPACE(ch);
150 case SRE_CATEGORY_NOT_SPACE:
151 return !SRE_IS_SPACE(ch);
152 case SRE_CATEGORY_WORD:
153 return SRE_IS_WORD(ch);
154 case SRE_CATEGORY_NOT_WORD:
155 return !SRE_IS_WORD(ch);
156 case SRE_CATEGORY_LINEBREAK:
157 return SRE_IS_LINEBREAK(ch);
158 case SRE_CATEGORY_NOT_LINEBREAK:
159 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000160
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000161 case SRE_CATEGORY_LOC_WORD:
162 return SRE_LOC_IS_WORD(ch);
163 case SRE_CATEGORY_LOC_NOT_WORD:
164 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000165
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000166 case SRE_CATEGORY_UNI_DIGIT:
167 return SRE_UNI_IS_DIGIT(ch);
168 case SRE_CATEGORY_UNI_NOT_DIGIT:
169 return !SRE_UNI_IS_DIGIT(ch);
170 case SRE_CATEGORY_UNI_SPACE:
171 return SRE_UNI_IS_SPACE(ch);
172 case SRE_CATEGORY_UNI_NOT_SPACE:
173 return !SRE_UNI_IS_SPACE(ch);
174 case SRE_CATEGORY_UNI_WORD:
175 return SRE_UNI_IS_WORD(ch);
176 case SRE_CATEGORY_UNI_NOT_WORD:
177 return !SRE_UNI_IS_WORD(ch);
178 case SRE_CATEGORY_UNI_LINEBREAK:
179 return SRE_UNI_IS_LINEBREAK(ch);
180 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
181 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000182 }
183 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000184}
185
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300186LOCAL(int)
187char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
188{
189 return ch == pattern
190 || (SRE_CODE) sre_lower_locale(ch) == pattern
191 || (SRE_CODE) sre_upper_locale(ch) == pattern;
192}
193
194
Guido van Rossumb700df92000-03-31 14:59:30 +0000195/* helpers */
196
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000197static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000198data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000199{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000200 if (state->data_stack) {
Victor Stinner00d7abd2020-12-01 09:56:42 +0100201 PyMem_Free(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000202 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000203 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000204 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000205}
206
207static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000208data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000209{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000210 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000211 minsize = state->data_stack_base+size;
212 cursize = state->data_stack_size;
213 if (cursize < minsize) {
214 void* stack;
215 cursize = minsize+minsize/4+1024;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200216 TRACE(("allocate/grow stack %zd\n", cursize));
Victor Stinner00d7abd2020-12-01 09:56:42 +0100217 stack = PyMem_Realloc(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000218 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000219 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000220 return SRE_ERROR_MEMORY;
221 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000222 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000223 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000224 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000225 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000226}
227
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000228/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000229
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300230#define SRE_CHAR Py_UCS1
231#define SIZEOF_SRE_CHAR 1
232#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300233#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000234
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300235/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000236
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300237#define SRE_CHAR Py_UCS2
238#define SIZEOF_SRE_CHAR 2
239#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300240#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000241
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300242/* generate 32-bit unicode version */
243
244#define SRE_CHAR Py_UCS4
245#define SIZEOF_SRE_CHAR 4
246#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300247#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000248
249/* -------------------------------------------------------------------- */
250/* factories and destructors */
251
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100252/* module state */
253typedef struct {
254 PyTypeObject *Pattern_Type;
255 PyTypeObject *Match_Type;
256 PyTypeObject *Scanner_Type;
257} _sremodulestate;
Guido van Rossumb700df92000-03-31 14:59:30 +0000258
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100259static _sremodulestate *
260get_sre_module_state(PyObject *m)
261{
Victor Stinnercdad2722021-04-22 00:52:52 +0200262 _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100263 assert(state);
264 return state;
265}
266
267static struct PyModuleDef sremodule;
268#define get_sre_module_state_by_class(cls) \
269 (get_sre_module_state(PyType_GetModule(cls)))
270
271/* see sre.h for object declarations */
272static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
273static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300274
/*[clinic input]
module _sre
class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/

Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300283/*[clinic input]
284_sre.getcodesize -> int
285[clinic start generated code]*/
286
287static int
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +0300288_sre_getcodesize_impl(PyObject *module)
289/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000290{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300291 return sizeof(SRE_CODE);
Guido van Rossumb700df92000-03-31 14:59:30 +0000292}
293
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300294/*[clinic input]
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300295_sre.ascii_iscased -> bool
296
297 character: int
298 /
299
300[clinic start generated code]*/
301
302static int
303_sre_ascii_iscased_impl(PyObject *module, int character)
304/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
305{
306 unsigned int ch = (unsigned int)character;
Sergey Fedoseev7f0d59f2018-09-12 17:49:09 +0500307 return ch < 128 && Py_ISALPHA(ch);
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300308}
309
310/*[clinic input]
311_sre.unicode_iscased -> bool
312
313 character: int
314 /
315
316[clinic start generated code]*/
317
318static int
319_sre_unicode_iscased_impl(PyObject *module, int character)
320/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
321{
322 unsigned int ch = (unsigned int)character;
323 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
324}
325
326/*[clinic input]
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300327_sre.ascii_tolower -> int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300328
329 character: int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300330 /
331
332[clinic start generated code]*/
333
334static int
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300335_sre_ascii_tolower_impl(PyObject *module, int character)
336/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000337{
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300338 return sre_lower_ascii(character);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000339}
340
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300341/*[clinic input]
342_sre.unicode_tolower -> int
343
344 character: int
345 /
346
347[clinic start generated code]*/
348
349static int
350_sre_unicode_tolower_impl(PyObject *module, int character)
351/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
352{
353 return sre_lower_unicode(character);
354}
355
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000356LOCAL(void)
357state_reset(SRE_STATE* state)
358{
animalize4a7f44a2019-02-18 21:26:37 +0800359 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000360 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000361
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000362 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000363 state->lastindex = -1;
364
365 state->repeat = NULL;
366
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000367 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000368}
369
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300370static const void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300372 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600373 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000374{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000375 /* given a python object, return a data pointer, a length (in
376 characters), and a character size. return NULL if the object
377 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000378
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000379 /* Unicode objects do not support the buffer API. So, get the data
380 directly instead. */
381 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200382 if (PyUnicode_READY(string) == -1)
383 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200385 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300386 *p_isbytes = 0;
387 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000388 }
389
Victor Stinner0058b862011-09-29 03:27:47 +0200390 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300391 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200392 PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300393 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000394 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000395
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300396 *p_length = view->len;
397 *p_charsize = 1;
398 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000399
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300400 if (view->buf == NULL) {
401 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
402 PyBuffer_Release(view);
403 view->buf = NULL;
404 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000405 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300406 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000407}
408
409LOCAL(PyObject*)
410state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000411 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000412{
413 /* prepare state object */
414
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000415 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300416 int isbytes, charsize;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300417 const void* ptr;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000418
419 memset(state, 0, sizeof(SRE_STATE));
420
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300421 state->mark = PyMem_New(const void *, pattern->groups * 2);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300422 if (!state->mark) {
423 PyErr_NoMemory();
424 goto err;
425 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000426 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000427 state->lastindex = -1;
428
Benjamin Petersone48944b2012-03-07 14:50:25 -0600429 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300430 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000431 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600432 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000433
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300434 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600435 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200436 "cannot use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600437 goto err;
438 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300439 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600440 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200441 "cannot use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600442 goto err;
443 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000444
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000445 /* adjust boundaries */
446 if (start < 0)
447 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000448 else if (start > length)
449 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000450
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000451 if (end < 0)
452 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000453 else if (end > length)
454 end = length;
455
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300456 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000457 state->charsize = charsize;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200458 state->match_all = 0;
459 state->must_advance = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000460
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000461 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000462
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000463 state->start = (void*) ((char*) ptr + start * state->charsize);
464 state->end = (void*) ((char*) ptr + end * state->charsize);
465
466 Py_INCREF(string);
467 state->string = string;
468 state->pos = start;
469 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000470
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000471 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600472 err:
Ammar Askar06e3a272020-06-01 17:21:43 +0000473 /* We add an explicit cast here because MSVC has a bug when
474 compiling C code where it believes that `const void**` cannot be
475 safely casted to `void*`, see bpo-39943 for details. */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100476 PyMem_Free((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300477 state->mark = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600478 if (state->buffer.buf)
479 PyBuffer_Release(&state->buffer);
480 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000481}
482
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000483LOCAL(void)
484state_fini(SRE_STATE* state)
485{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600486 if (state->buffer.buf)
487 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000488 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000489 data_stack_dealloc(state);
Ammar Askar06e3a272020-06-01 17:21:43 +0000490 /* See above PyMem_Del for why we explicitly cast here. */
Victor Stinner00d7abd2020-12-01 09:56:42 +0100491 PyMem_Free((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300492 state->mark = NULL;
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000493}
494
/* calculate offset from start of string: the byte distance between
   `member` and state->beginning, divided by the character size, i.e. an
   index in characters */
#define STATE_OFFSET(state, member)\
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)

Fredrik Lundh75f2d672000-06-29 11:34:28 +0000499LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300500getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300501 PyObject* string, Py_ssize_t start, Py_ssize_t end)
502{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300503 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300504 if (PyBytes_CheckExact(string) &&
505 start == 0 && end == PyBytes_GET_SIZE(string)) {
506 Py_INCREF(string);
507 return string;
508 }
509 return PyBytes_FromStringAndSize(
510 (const char *)ptr + start, end - start);
511 }
512 else {
513 return PyUnicode_Substring(string, start, end);
514 }
515}
516
517LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000518state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000519{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000520 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000521
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000522 index = (index - 1) * 2;
523
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000524 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000525 if (empty)
526 /* want empty string */
527 i = j = 0;
528 else {
Serhiy Storchaka228b12e2017-01-23 09:47:21 +0200529 Py_RETURN_NONE;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000530 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000531 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000532 i = STATE_OFFSET(state, state->mark[index]);
533 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000534 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000535
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300536 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000537}
538
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000539static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100540pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000541{
542 switch (status) {
543 case SRE_ERROR_RECURSION_LIMIT:
Yury Selivanovf488fb42015-07-03 01:04:23 -0400544 /* This error code seems to be unused. */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000545 PyErr_SetString(
Yury Selivanovf488fb42015-07-03 01:04:23 -0400546 PyExc_RecursionError,
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000547 "maximum recursion limit exceeded"
548 );
549 break;
550 case SRE_ERROR_MEMORY:
551 PyErr_NoMemory();
552 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000553 case SRE_ERROR_INTERRUPTED:
554 /* An exception has already been raised, so let it fly */
555 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000556 default:
557 /* other error codes indicate compiler/engine bugs */
558 PyErr_SetString(
559 PyExc_RuntimeError,
560 "internal error in regular expression engine"
561 );
562 }
563}
564
Guido van Rossumb700df92000-03-31 14:59:30 +0000565static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000566pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000567{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100568 PyTypeObject *tp = Py_TYPE(self);
569
Raymond Hettinger027bb632004-05-31 03:09:25 +0000570 if (self->weakreflist != NULL)
571 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000572 Py_XDECREF(self->pattern);
573 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000574 Py_XDECREF(self->indexgroup);
Victor Stinner32bd68c2020-12-01 10:37:39 +0100575 PyObject_Free(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100576 Py_DECREF(tp);
Guido van Rossumb700df92000-03-31 14:59:30 +0000577}
578
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300579LOCAL(Py_ssize_t)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200580sre_match(SRE_STATE* state, SRE_CODE* pattern)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300581{
582 if (state->charsize == 1)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200583 return sre_ucs1_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300584 if (state->charsize == 2)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200585 return sre_ucs2_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300586 assert(state->charsize == 4);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200587 return sre_ucs4_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300588}
589
590LOCAL(Py_ssize_t)
591sre_search(SRE_STATE* state, SRE_CODE* pattern)
592{
593 if (state->charsize == 1)
594 return sre_ucs1_search(state, pattern);
595 if (state->charsize == 2)
596 return sre_ucs2_search(state, pattern);
597 assert(state->charsize == 4);
598 return sre_ucs4_search(state, pattern);
599}
600
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300601/*[clinic input]
602_sre.SRE_Pattern.match
603
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100604 cls: defining_class
605 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200606 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300607 pos: Py_ssize_t = 0
608 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300609
610Matches zero or more characters at the beginning of the string.
611[clinic start generated code]*/
612
Larry Hastings16c51912014-01-07 11:53:01 -0800613static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100614_sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
615 PyObject *string, Py_ssize_t pos,
616 Py_ssize_t endpos)
617/*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
Larry Hastings16c51912014-01-07 11:53:01 -0800618{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100619 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000620 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100621 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300622 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000623
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300624 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000625 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000626
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000627 state.ptr = state.start;
628
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000629 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
630
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200631 status = sre_match(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000632
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000633 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300634 if (PyErr_Occurred()) {
635 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000636 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300637 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000638
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100639 match = pattern_new_match(module_state, self, &state, status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000640 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300641 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000642}
643
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300644/*[clinic input]
645_sre.SRE_Pattern.fullmatch
646
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100647 cls: defining_class
648 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200649 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300650 pos: Py_ssize_t = 0
651 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300652
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +0300653Matches against all of the string.
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300654[clinic start generated code]*/
655
656static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100657_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
658 PyObject *string, Py_ssize_t pos,
659 Py_ssize_t endpos)
660/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200661{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100662 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200663 SRE_STATE state;
664 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300665 PyObject *match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200666
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300667 if (!state_init(&state, self, string, pos, endpos))
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200668 return NULL;
669
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200670 state.ptr = state.start;
671
672 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
673
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200674 state.match_all = 1;
675 status = sre_match(&state, PatternObject_GetCode(self));
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200676
677 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300678 if (PyErr_Occurred()) {
679 state_fini(&state);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200680 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300681 }
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200682
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100683 match = pattern_new_match(module_state, self, &state, status);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200684 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300685 return match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200686}
687
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300688/*[clinic input]
689_sre.SRE_Pattern.search
690
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100691 cls: defining_class
692 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200693 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300694 pos: Py_ssize_t = 0
695 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300696
697Scan through string looking for a match, and return a corresponding match object instance.
698
699Return None if no position in the string matches.
700[clinic start generated code]*/
701
702static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100703_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
704 PyObject *string, Py_ssize_t pos,
705 Py_ssize_t endpos)
706/*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000707{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100708 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000709 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100710 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300711 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000712
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300713 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000714 return NULL;
715
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000716 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
717
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300718 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000719
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000720 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
721
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300722 if (PyErr_Occurred()) {
723 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000724 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300725 }
Thomas Wouters89f507f2006-12-13 04:49:30 +0000726
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100727 match = pattern_new_match(module_state, self, &state, status);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300728 state_fini(&state);
729 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000730}
731
732static PyObject*
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200733call(const char* module, const char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000734{
735 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000736 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000737 PyObject* func;
738 PyObject* result;
739
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000740 if (!args)
741 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000742 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000743 if (!name)
744 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000745 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000746 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000747 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000748 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000749 func = PyObject_GetAttrString(mod, function);
750 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000751 if (!func)
752 return NULL;
753 result = PyObject_CallObject(func, args);
754 Py_DECREF(func);
755 Py_DECREF(args);
756 return result;
757}
758
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300759/*[clinic input]
760_sre.SRE_Pattern.findall
761
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200762 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300763 pos: Py_ssize_t = 0
764 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300765
766Return a list of all non-overlapping matches of pattern in string.
767[clinic start generated code]*/
768
769static PyObject *
770_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200771 Py_ssize_t pos, Py_ssize_t endpos)
772/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000773{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000774 SRE_STATE state;
775 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100776 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000777 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000778
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300779 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000780 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000781
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000782 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000783 if (!list) {
784 state_fini(&state);
785 return NULL;
786 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000787
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000788 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000789
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000790 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000791
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000792 state_reset(&state);
793
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000794 state.ptr = state.start;
795
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300796 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300797 if (PyErr_Occurred())
798 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000799
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000800 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000801 if (status == 0)
802 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000803 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000805 }
Tim Peters3d563502006-01-21 02:47:53 +0000806
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000807 /* don't bother to build a match object */
808 switch (self->groups) {
809 case 0:
810 b = STATE_OFFSET(&state, state.start);
811 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300812 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300813 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000814 if (!item)
815 goto error;
816 break;
817 case 1:
818 item = state_getslice(&state, 1, string, 1);
819 if (!item)
820 goto error;
821 break;
822 default:
823 item = PyTuple_New(self->groups);
824 if (!item)
825 goto error;
826 for (i = 0; i < self->groups; i++) {
827 PyObject* o = state_getslice(&state, i+1, string, 1);
828 if (!o) {
829 Py_DECREF(item);
830 goto error;
831 }
832 PyTuple_SET_ITEM(item, i, o);
833 }
834 break;
835 }
836
837 status = PyList_Append(list, item);
838 Py_DECREF(item);
839 if (status < 0)
840 goto error;
841
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200842 state.must_advance = (state.ptr == state.start);
843 state.start = state.ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000844 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000845
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000846 state_fini(&state);
847 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000848
849error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000850 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000851 state_fini(&state);
852 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000853
Guido van Rossumb700df92000-03-31 14:59:30 +0000854}
855
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300856/*[clinic input]
857_sre.SRE_Pattern.finditer
858
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100859 cls: defining_class
860 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300861 string: object
862 pos: Py_ssize_t = 0
863 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
864
865Return an iterator over all non-overlapping matches for the RE pattern in string.
866
867For each match, the iterator returns a match object.
868[clinic start generated code]*/
869
870static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100871_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
872 PyObject *string, Py_ssize_t pos,
873 Py_ssize_t endpos)
874/*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
Fredrik Lundh703ce812001-10-24 22:16:30 +0000875{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100876 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000877 PyObject* scanner;
878 PyObject* search;
879 PyObject* iterator;
880
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100881 scanner = pattern_scanner(module_state, self, string, pos, endpos);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000882 if (!scanner)
883 return NULL;
884
885 search = PyObject_GetAttrString(scanner, "search");
886 Py_DECREF(scanner);
887 if (!search)
888 return NULL;
889
890 iterator = PyCallIter_New(search, Py_None);
891 Py_DECREF(search);
892
893 return iterator;
894}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000895
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300896/*[clinic input]
897_sre.SRE_Pattern.scanner
898
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100899 cls: defining_class
900 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300901 string: object
902 pos: Py_ssize_t = 0
903 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
904
905[clinic start generated code]*/
906
907static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100908_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
909 PyObject *string, Py_ssize_t pos,
910 Py_ssize_t endpos)
911/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300912{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100913 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
914
915 return pattern_scanner(module_state, self, string, pos, endpos);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300916}
917
918/*[clinic input]
919_sre.SRE_Pattern.split
920
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200921 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300922 maxsplit: Py_ssize_t = 0
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300923
924Split string by the occurrences of pattern.
925[clinic start generated code]*/
926
927static PyObject *
928_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200929 Py_ssize_t maxsplit)
930/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000931{
932 SRE_STATE state;
933 PyObject* list;
934 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100935 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000936 Py_ssize_t n;
937 Py_ssize_t i;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300938 const void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000939
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200940 assert(self->codesize != 0);
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200941
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300942 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000943 return NULL;
944
945 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000946 if (!list) {
947 state_fini(&state);
948 return NULL;
949 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000950
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000951 n = 0;
952 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000953
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000954 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000955
956 state_reset(&state);
957
958 state.ptr = state.start;
959
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300960 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300961 if (PyErr_Occurred())
962 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000963
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000964 if (status <= 0) {
965 if (status == 0)
966 break;
967 pattern_error(status);
968 goto error;
969 }
Tim Peters3d563502006-01-21 02:47:53 +0000970
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000971 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300972 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000973 string, STATE_OFFSET(&state, last),
974 STATE_OFFSET(&state, state.start)
975 );
976 if (!item)
977 goto error;
978 status = PyList_Append(list, item);
979 Py_DECREF(item);
980 if (status < 0)
981 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000982
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000983 /* add groups (if any) */
984 for (i = 0; i < self->groups; i++) {
985 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000986 if (!item)
987 goto error;
988 status = PyList_Append(list, item);
989 Py_DECREF(item);
990 if (status < 0)
991 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000992 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000993
994 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +0200995 state.must_advance = (state.ptr == state.start);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000996 last = state.start = state.ptr;
997
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000998 }
999
Fredrik Lundhf864aa82001-10-22 06:01:56 +00001000 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001001 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +00001002 string, STATE_OFFSET(&state, last), state.endpos
1003 );
1004 if (!item)
1005 goto error;
1006 status = PyList_Append(list, item);
1007 Py_DECREF(item);
1008 if (status < 0)
1009 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001010
1011 state_fini(&state);
1012 return list;
1013
1014error:
1015 Py_DECREF(list);
1016 state_fini(&state);
1017 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001018
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001019}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001020
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001021static PyObject*
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001022pattern_subx(_sremodulestate* module_state,
1023 PatternObject* self,
1024 PyObject* ptemplate,
1025 PyObject* string,
1026 Py_ssize_t count,
1027 Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001028{
1029 SRE_STATE state;
1030 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001031 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001032 PyObject* item;
1033 PyObject* filter;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001034 PyObject* match;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001035 const void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01001036 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001037 Py_ssize_t n;
1038 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001039 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001040 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001041 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001042
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001043 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001044 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001045 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001046 Py_INCREF(filter);
1047 filter_is_callable = 1;
1048 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001049 /* if not callable, check if it's a literal string */
1050 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001051 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001052 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001053 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001054 if (charsize == 1)
1055 literal = memchr(ptr, '\\', n) == NULL;
1056 else
1057 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001058 } else {
1059 PyErr_Clear();
1060 literal = 0;
1061 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06001062 if (view.buf)
1063 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001064 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001065 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001066 Py_INCREF(filter);
1067 filter_is_callable = 0;
1068 } else {
1069 /* not a literal; hand it over to the template compiler */
1070 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00001071 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001072 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001073 );
1074 if (!filter)
1075 return NULL;
1076 filter_is_callable = PyCallable_Check(filter);
1077 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00001078 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001079
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001080 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001081 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001082 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001083 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001084
1085 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001086 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001087 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001088 state_fini(&state);
1089 return NULL;
1090 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001091
1092 n = i = 0;
1093
1094 while (!count || n < count) {
1095
1096 state_reset(&state);
1097
1098 state.ptr = state.start;
1099
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001100 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001101 if (PyErr_Occurred())
1102 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001103
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001104 if (status <= 0) {
1105 if (status == 0)
1106 break;
1107 pattern_error(status);
1108 goto error;
1109 }
Tim Peters3d563502006-01-21 02:47:53 +00001110
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001111 b = STATE_OFFSET(&state, state.start);
1112 e = STATE_OFFSET(&state, state.ptr);
1113
1114 if (i < b) {
1115 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001116 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001117 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001118 if (!item)
1119 goto error;
1120 status = PyList_Append(list, item);
1121 Py_DECREF(item);
1122 if (status < 0)
1123 goto error;
1124
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001125 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001126
1127 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001128 /* pass match object through filter */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001129 match = pattern_new_match(module_state, self, &state, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001130 if (!match)
1131 goto error;
Petr Viktorinffd97532020-02-11 17:46:57 +01001132 item = PyObject_CallOneArg(filter, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001133 Py_DECREF(match);
1134 if (!item)
1135 goto error;
1136 } else {
1137 /* filter is literal string */
1138 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001139 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001140 }
1141
1142 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001143 if (item != Py_None) {
1144 status = PyList_Append(list, item);
1145 Py_DECREF(item);
1146 if (status < 0)
1147 goto error;
1148 }
Tim Peters3d563502006-01-21 02:47:53 +00001149
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001150 i = e;
1151 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001152 state.must_advance = (state.ptr == state.start);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001153 state.start = state.ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001154 }
1155
1156 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001157 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001158 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001159 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001160 if (!item)
1161 goto error;
1162 status = PyList_Append(list, item);
1163 Py_DECREF(item);
1164 if (status < 0)
1165 goto error;
1166 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001167
1168 state_fini(&state);
1169
Guido van Rossum4e173842001-12-07 04:25:10 +00001170 Py_DECREF(filter);
1171
Fredrik Lundhdac58492001-10-21 21:48:30 +00001172 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001173 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001174 if (!joiner) {
1175 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001176 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001177 }
1178 if (PyList_GET_SIZE(list) == 0) {
1179 Py_DECREF(list);
1180 item = joiner;
1181 }
1182 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001183 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001184 item = _PyBytes_Join(joiner, list);
1185 else
1186 item = PyUnicode_Join(joiner, list);
1187 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001188 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001189 if (!item)
1190 return NULL;
1191 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001192
1193 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001194 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001195
1196 return item;
1197
1198error:
1199 Py_DECREF(list);
1200 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001201 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001202 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001203
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001204}
1205
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001206/*[clinic input]
1207_sre.SRE_Pattern.sub
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001208
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001209 cls: defining_class
1210 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001211 repl: object
1212 string: object
1213 count: Py_ssize_t = 0
1214
1215Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1216[clinic start generated code]*/
1217
1218static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001219_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1220 PyObject *repl, PyObject *string, Py_ssize_t count)
1221/*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001222{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001223 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1224
1225 return pattern_subx(module_state, self, repl, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001226}
1227
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001228/*[clinic input]
1229_sre.SRE_Pattern.subn
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001230
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001231 cls: defining_class
1232 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001233 repl: object
1234 string: object
1235 count: Py_ssize_t = 0
1236
1237Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1238[clinic start generated code]*/
1239
1240static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001241_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1242 PyObject *repl, PyObject *string,
1243 Py_ssize_t count)
1244/*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001245{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001246 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1247
1248 return pattern_subx(module_state, self, repl, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001249}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001250
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001251/*[clinic input]
1252_sre.SRE_Pattern.__copy__
1253
1254[clinic start generated code]*/
1255
1256static PyObject *
1257_sre_SRE_Pattern___copy___impl(PatternObject *self)
1258/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001259{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001260 Py_INCREF(self);
1261 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001262}
1263
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001264/*[clinic input]
1265_sre.SRE_Pattern.__deepcopy__
1266
1267 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001268 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001269
1270[clinic start generated code]*/
1271
1272static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001273_sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1274/*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001275{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001276 Py_INCREF(self);
1277 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001278}
1279
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001280static PyObject *
1281pattern_repr(PatternObject *obj)
1282{
1283 static const struct {
1284 const char *name;
1285 int value;
1286 } flag_names[] = {
1287 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1288 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1289 {"re.LOCALE", SRE_FLAG_LOCALE},
1290 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1291 {"re.DOTALL", SRE_FLAG_DOTALL},
1292 {"re.UNICODE", SRE_FLAG_UNICODE},
1293 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1294 {"re.DEBUG", SRE_FLAG_DEBUG},
1295 {"re.ASCII", SRE_FLAG_ASCII},
1296 };
1297 PyObject *result = NULL;
1298 PyObject *flag_items;
Victor Stinner706768c2014-08-16 01:03:39 +02001299 size_t i;
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001300 int flags = obj->flags;
1301
1302 /* Omit re.UNICODE for valid string patterns. */
1303 if (obj->isbytes == 0 &&
1304 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1305 SRE_FLAG_UNICODE)
1306 flags &= ~SRE_FLAG_UNICODE;
1307
1308 flag_items = PyList_New(0);
1309 if (!flag_items)
1310 return NULL;
1311
1312 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1313 if (flags & flag_names[i].value) {
1314 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1315 if (!item)
1316 goto done;
1317
1318 if (PyList_Append(flag_items, item) < 0) {
1319 Py_DECREF(item);
1320 goto done;
1321 }
1322 Py_DECREF(item);
1323 flags &= ~flag_names[i].value;
1324 }
1325 }
1326 if (flags) {
1327 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1328 if (!item)
1329 goto done;
1330
1331 if (PyList_Append(flag_items, item) < 0) {
1332 Py_DECREF(item);
1333 goto done;
1334 }
1335 Py_DECREF(item);
1336 }
1337
1338 if (PyList_Size(flag_items) > 0) {
1339 PyObject *flags_result;
1340 PyObject *sep = PyUnicode_FromString("|");
1341 if (!sep)
1342 goto done;
1343 flags_result = PyUnicode_Join(sep, flag_items);
1344 Py_DECREF(sep);
1345 if (!flags_result)
1346 goto done;
1347 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1348 obj->pattern, flags_result);
1349 Py_DECREF(flags_result);
1350 }
1351 else {
1352 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1353 }
1354
1355done:
1356 Py_DECREF(flag_items);
1357 return result;
1358}
1359
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001360PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001361
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001362/* PatternObject's 'groupindex' method. */
1363static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02001364pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001365{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001366 if (self->groupindex == NULL)
1367 return PyDict_New();
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001368 return PyDictProxy_New(self->groupindex);
1369}
1370
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001371static int _validate(PatternObject *self); /* Forward */
1372
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001373/*[clinic input]
1374_sre.compile
1375
1376 pattern: object
1377 flags: int
1378 code: object(subclass_of='&PyList_Type')
1379 groups: Py_ssize_t
Victor Stinner726a57d2016-11-22 23:04:39 +01001380 groupindex: object(subclass_of='&PyDict_Type')
1381 indexgroup: object(subclass_of='&PyTuple_Type')
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001382
1383[clinic start generated code]*/
1384
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001385static PyObject *
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +03001386_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001387 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1388 PyObject *indexgroup)
Victor Stinner726a57d2016-11-22 23:04:39 +01001389/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001390{
1391 /* "compile" pattern descriptor to pattern object */
1392
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001393 _sremodulestate *module_state = get_sre_module_state(module);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001394 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001395 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001396
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001397 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001398 /* coverity[ampersand_in_size] */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001399 self = PyObject_NewVar(PatternObject, module_state->Pattern_Type, n);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001400 if (!self)
1401 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001402 self->weakreflist = NULL;
1403 self->pattern = NULL;
1404 self->groupindex = NULL;
1405 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001406
1407 self->codesize = n;
1408
1409 for (i = 0; i < n; i++) {
1410 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001411 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001412 self->code[i] = (SRE_CODE) value;
1413 if ((unsigned long) self->code[i] != value) {
1414 PyErr_SetString(PyExc_OverflowError,
1415 "regular expression code size limit exceeded");
1416 break;
1417 }
1418 }
1419
1420 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001421 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001422 return NULL;
1423 }
1424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001426 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428 else {
1429 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001430 int charsize;
1431 Py_buffer view;
1432 view.buf = NULL;
1433 if (!getstring(pattern, &p_length, &self->isbytes,
1434 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435 Py_DECREF(self);
1436 return NULL;
1437 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001438 if (view.buf)
1439 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001441
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001442 Py_INCREF(pattern);
1443 self->pattern = pattern;
1444
1445 self->flags = flags;
1446
1447 self->groups = groups;
1448
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001449 if (PyDict_GET_SIZE(groupindex) > 0) {
1450 Py_INCREF(groupindex);
1451 self->groupindex = groupindex;
1452 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1453 Py_INCREF(indexgroup);
1454 self->indexgroup = indexgroup;
1455 }
1456 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001457
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001458 if (!_validate(self)) {
1459 Py_DECREF(self);
1460 return NULL;
1461 }
1462
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001463 return (PyObject*) self;
1464}
1465
Guido van Rossumb700df92000-03-31 14:59:30 +00001466/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001467/* Code validation */
1468
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
1490
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator; takes a parenthesized printf argument
   list, e.g. VTRACE(("x=%d\n", x)) */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros expect the locals 'code' and 'end' (current position and
   one-past-the-end of the code being validated) plus 'op', 'arg' or 'skip'
   to be in scope, and FAIL when 'code' has already reached 'end'. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and check that, after subtracting 'adj', it does not
   reach past 'end' (measured from the skip word itself).  'adj' is
   parenthesized so that an expression argument is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001531
1532static int
1533_validate_charset(SRE_CODE *code, SRE_CODE *end)
1534{
1535 /* Some variables are manipulated by the macros above */
1536 SRE_CODE op;
1537 SRE_CODE arg;
1538 SRE_CODE offset;
1539 int i;
1540
1541 while (code < end) {
1542 GET_OP;
1543 switch (op) {
1544
1545 case SRE_OP_NEGATE:
1546 break;
1547
1548 case SRE_OP_LITERAL:
1549 GET_ARG;
1550 break;
1551
1552 case SRE_OP_RANGE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001553 case SRE_OP_RANGE_UNI_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001554 GET_ARG;
1555 GET_ARG;
1556 break;
1557
1558 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001559 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Benjamin Petersonca470632016-09-06 13:47:26 -07001560 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001561 FAIL;
1562 code += offset;
1563 break;
1564
1565 case SRE_OP_BIGCHARSET:
1566 GET_ARG; /* Number of blocks */
1567 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001568 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001569 FAIL;
1570 /* Make sure that each byte points to a valid block */
1571 for (i = 0; i < 256; i++) {
1572 if (((unsigned char *)code)[i] >= arg)
1573 FAIL;
1574 }
1575 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001576 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Benjamin Petersonca470632016-09-06 13:47:26 -07001577 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001578 FAIL;
1579 code += offset;
1580 break;
1581
1582 case SRE_OP_CATEGORY:
1583 GET_ARG;
1584 switch (arg) {
1585 case SRE_CATEGORY_DIGIT:
1586 case SRE_CATEGORY_NOT_DIGIT:
1587 case SRE_CATEGORY_SPACE:
1588 case SRE_CATEGORY_NOT_SPACE:
1589 case SRE_CATEGORY_WORD:
1590 case SRE_CATEGORY_NOT_WORD:
1591 case SRE_CATEGORY_LINEBREAK:
1592 case SRE_CATEGORY_NOT_LINEBREAK:
1593 case SRE_CATEGORY_LOC_WORD:
1594 case SRE_CATEGORY_LOC_NOT_WORD:
1595 case SRE_CATEGORY_UNI_DIGIT:
1596 case SRE_CATEGORY_UNI_NOT_DIGIT:
1597 case SRE_CATEGORY_UNI_SPACE:
1598 case SRE_CATEGORY_UNI_NOT_SPACE:
1599 case SRE_CATEGORY_UNI_WORD:
1600 case SRE_CATEGORY_UNI_NOT_WORD:
1601 case SRE_CATEGORY_UNI_LINEBREAK:
1602 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1603 break;
1604 default:
1605 FAIL;
1606 }
1607 break;
1608
1609 default:
1610 FAIL;
1611
1612 }
1613 }
1614
1615 return 1;
1616}
1617
1618static int
1619_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1620{
1621 /* Some variables are manipulated by the macros above */
1622 SRE_CODE op;
1623 SRE_CODE arg;
1624 SRE_CODE skip;
1625
1626 VTRACE(("code=%p, end=%p\n", code, end));
1627
1628 if (code > end)
1629 FAIL;
1630
1631 while (code < end) {
1632 GET_OP;
1633 switch (op) {
1634
1635 case SRE_OP_MARK:
1636 /* We don't check whether marks are properly nested; the
1637 sre_match() code is robust even if they don't, and the worst
1638 you can get is nonsensical match results. */
1639 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001640 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001641 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1642 FAIL;
1643 }
1644 break;
1645
1646 case SRE_OP_LITERAL:
1647 case SRE_OP_NOT_LITERAL:
1648 case SRE_OP_LITERAL_IGNORE:
1649 case SRE_OP_NOT_LITERAL_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001650 case SRE_OP_LITERAL_UNI_IGNORE:
1651 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001652 case SRE_OP_LITERAL_LOC_IGNORE:
1653 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001654 GET_ARG;
1655 /* The arg is just a character, nothing to check */
1656 break;
1657
1658 case SRE_OP_SUCCESS:
1659 case SRE_OP_FAILURE:
1660 /* Nothing to check; these normally end the matching process */
1661 break;
1662
1663 case SRE_OP_AT:
1664 GET_ARG;
1665 switch (arg) {
1666 case SRE_AT_BEGINNING:
1667 case SRE_AT_BEGINNING_STRING:
1668 case SRE_AT_BEGINNING_LINE:
1669 case SRE_AT_END:
1670 case SRE_AT_END_LINE:
1671 case SRE_AT_END_STRING:
1672 case SRE_AT_BOUNDARY:
1673 case SRE_AT_NON_BOUNDARY:
1674 case SRE_AT_LOC_BOUNDARY:
1675 case SRE_AT_LOC_NON_BOUNDARY:
1676 case SRE_AT_UNI_BOUNDARY:
1677 case SRE_AT_UNI_NON_BOUNDARY:
1678 break;
1679 default:
1680 FAIL;
1681 }
1682 break;
1683
1684 case SRE_OP_ANY:
1685 case SRE_OP_ANY_ALL:
1686 /* These have no operands */
1687 break;
1688
1689 case SRE_OP_IN:
1690 case SRE_OP_IN_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001691 case SRE_OP_IN_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001692 case SRE_OP_IN_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001693 GET_SKIP;
1694 /* Stop 1 before the end; we check the FAILURE below */
1695 if (!_validate_charset(code, code+skip-2))
1696 FAIL;
1697 if (code[skip-2] != SRE_OP_FAILURE)
1698 FAIL;
1699 code += skip-1;
1700 break;
1701
1702 case SRE_OP_INFO:
1703 {
1704 /* A minimal info field is
1705 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1706 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1707 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001708 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001709 SRE_CODE *newcode;
1710 GET_SKIP;
1711 newcode = code+skip-1;
1712 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001713 GET_ARG;
1714 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001715 /* Check that only valid flags are present */
1716 if ((flags & ~(SRE_INFO_PREFIX |
1717 SRE_INFO_LITERAL |
1718 SRE_INFO_CHARSET)) != 0)
1719 FAIL;
1720 /* PREFIX and CHARSET are mutually exclusive */
1721 if ((flags & SRE_INFO_PREFIX) &&
1722 (flags & SRE_INFO_CHARSET))
1723 FAIL;
1724 /* LITERAL implies PREFIX */
1725 if ((flags & SRE_INFO_LITERAL) &&
1726 !(flags & SRE_INFO_PREFIX))
1727 FAIL;
1728 /* Validate the prefix */
1729 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001730 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001731 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001732 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001733 /* Here comes the prefix string */
Benjamin Petersonca470632016-09-06 13:47:26 -07001734 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001735 FAIL;
1736 code += prefix_len;
1737 /* And here comes the overlap table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001738 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001739 FAIL;
1740 /* Each overlap value should be < prefix_len */
1741 for (i = 0; i < prefix_len; i++) {
1742 if (code[i] >= prefix_len)
1743 FAIL;
1744 }
1745 code += prefix_len;
1746 }
1747 /* Validate the charset */
1748 if (flags & SRE_INFO_CHARSET) {
1749 if (!_validate_charset(code, newcode-1))
1750 FAIL;
1751 if (newcode[-1] != SRE_OP_FAILURE)
1752 FAIL;
1753 code = newcode;
1754 }
1755 else if (code != newcode) {
1756 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1757 FAIL;
1758 }
1759 }
1760 break;
1761
1762 case SRE_OP_BRANCH:
1763 {
1764 SRE_CODE *target = NULL;
1765 for (;;) {
1766 GET_SKIP;
1767 if (skip == 0)
1768 break;
1769 /* Stop 2 before the end; we check the JUMP below */
1770 if (!_validate_inner(code, code+skip-3, groups))
1771 FAIL;
1772 code += skip-3;
1773 /* Check that it ends with a JUMP, and that each JUMP
1774 has the same target */
1775 GET_OP;
1776 if (op != SRE_OP_JUMP)
1777 FAIL;
1778 GET_SKIP;
1779 if (target == NULL)
1780 target = code+skip-1;
1781 else if (code+skip-1 != target)
1782 FAIL;
1783 }
1784 }
1785 break;
1786
1787 case SRE_OP_REPEAT_ONE:
1788 case SRE_OP_MIN_REPEAT_ONE:
1789 {
1790 SRE_CODE min, max;
1791 GET_SKIP;
1792 GET_ARG; min = arg;
1793 GET_ARG; max = arg;
1794 if (min > max)
1795 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001796 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001797 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001798 if (!_validate_inner(code, code+skip-4, groups))
1799 FAIL;
1800 code += skip-4;
1801 GET_OP;
1802 if (op != SRE_OP_SUCCESS)
1803 FAIL;
1804 }
1805 break;
1806
1807 case SRE_OP_REPEAT:
1808 {
1809 SRE_CODE min, max;
1810 GET_SKIP;
1811 GET_ARG; min = arg;
1812 GET_ARG; max = arg;
1813 if (min > max)
1814 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001815 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001816 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001817 if (!_validate_inner(code, code+skip-3, groups))
1818 FAIL;
1819 code += skip-3;
1820 GET_OP;
1821 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1822 FAIL;
1823 }
1824 break;
1825
1826 case SRE_OP_GROUPREF:
1827 case SRE_OP_GROUPREF_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001828 case SRE_OP_GROUPREF_UNI_IGNORE:
1829 case SRE_OP_GROUPREF_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001830 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001831 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001832 FAIL;
1833 break;
1834
1835 case SRE_OP_GROUPREF_EXISTS:
1836 /* The regex syntax for this is: '(?(group)then|else)', where
1837 'group' is either an integer group number or a group name,
1838 'then' and 'else' are sub-regexes, and 'else' is optional. */
1839 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001840 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001841 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001842 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001843 code--; /* The skip is relative to the first arg! */
1844 /* There are two possibilities here: if there is both a 'then'
1845 part and an 'else' part, the generated code looks like:
1846
1847 GROUPREF_EXISTS
1848 <group>
1849 <skipyes>
1850 ...then part...
1851 JUMP
1852 <skipno>
1853 (<skipyes> jumps here)
1854 ...else part...
1855 (<skipno> jumps here)
1856
1857 If there is only a 'then' part, it looks like:
1858
1859 GROUPREF_EXISTS
1860 <group>
1861 <skip>
1862 ...then part...
1863 (<skip> jumps here)
1864
1865 There is no direct way to decide which it is, and we don't want
1866 to allow arbitrary jumps anywhere in the code; so we just look
1867 for a JUMP opcode preceding our skip target.
1868 */
Benjamin Petersonca470632016-09-06 13:47:26 -07001869 if (skip >= 3 && skip-3 < (uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001870 code[skip-3] == SRE_OP_JUMP)
1871 {
1872 VTRACE(("both then and else parts present\n"));
1873 if (!_validate_inner(code+1, code+skip-3, groups))
1874 FAIL;
1875 code += skip-2; /* Position after JUMP, at <skipno> */
1876 GET_SKIP;
1877 if (!_validate_inner(code, code+skip-1, groups))
1878 FAIL;
1879 code += skip-1;
1880 }
1881 else {
1882 VTRACE(("only a then part present\n"));
1883 if (!_validate_inner(code+1, code+skip-1, groups))
1884 FAIL;
1885 code += skip-1;
1886 }
1887 break;
1888
1889 case SRE_OP_ASSERT:
1890 case SRE_OP_ASSERT_NOT:
1891 GET_SKIP;
1892 GET_ARG; /* 0 for lookahead, width for lookbehind */
1893 code--; /* Back up over arg to simplify math below */
1894 if (arg & 0x80000000)
1895 FAIL; /* Width too large */
1896 /* Stop 1 before the end; we check the SUCCESS below */
1897 if (!_validate_inner(code+1, code+skip-2, groups))
1898 FAIL;
1899 code += skip-2;
1900 GET_OP;
1901 if (op != SRE_OP_SUCCESS)
1902 FAIL;
1903 break;
1904
1905 default:
1906 FAIL;
1907
1908 }
1909 }
1910
1911 VTRACE(("okay\n"));
1912 return 1;
1913}
1914
1915static int
1916_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1917{
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001918 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1919 code >= end || end[-1] != SRE_OP_SUCCESS)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001920 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001921 return _validate_inner(code, end-1, groups);
1922}
1923
1924static int
1925_validate(PatternObject *self)
1926{
1927 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1928 {
1929 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1930 return 0;
1931 }
1932 else
1933 VTRACE(("Success!\n"));
1934 return 1;
1935}
1936
1937/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001938/* match methods */
1939
1940static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001941match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001942{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001943 PyTypeObject *tp = Py_TYPE(self);
1944
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001945 Py_XDECREF(self->regs);
1946 Py_XDECREF(self->string);
1947 Py_DECREF(self->pattern);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001948 PyObject_Free(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001949 Py_DECREF(tp);
Guido van Rossumb700df92000-03-31 14:59:30 +00001950}
1951
1952static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001953match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001954{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001955 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001956 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001957 Py_buffer view;
1958 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001959 const void* ptr;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001960 Py_ssize_t i, j;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001961
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001962 assert(0 <= index && index < self->groups);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001963 index *= 2;
1964
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001965 if (self->string == Py_None || self->mark[index] < 0) {
1966 /* return default value if the string or group is undefined */
1967 Py_INCREF(def);
1968 return def;
1969 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001970
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001971 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001972 if (ptr == NULL)
1973 return NULL;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001974
1975 i = self->mark[index];
1976 j = self->mark[index+1];
1977 i = Py_MIN(i, length);
1978 j = Py_MIN(j, length);
1979 result = getslice(isbytes, ptr, self->string, i, j);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001980 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001981 PyBuffer_Release(&view);
1982 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001983}
1984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001985static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001986match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001987{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001988 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001989
Guido van Rossumddefaf32007-01-14 03:31:43 +00001990 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001991 /* Default value */
1992 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00001993
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001994 if (PyIndex_Check(index)) {
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001995 i = PyNumber_AsSsize_t(index, NULL);
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001996 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001997 else {
1998 i = -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001999
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002000 if (self->pattern->groupindex) {
2001 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2002 if (index && PyLong_Check(index)) {
2003 i = PyLong_AsSsize_t(index);
2004 }
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002005 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002006 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002007 if (i < 0 || i >= self->groups) {
2008 /* raise IndexError if we were given a bad group number */
2009 if (!PyErr_Occurred()) {
2010 PyErr_SetString(PyExc_IndexError, "no such group");
2011 }
2012 return -1;
2013 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002014
2015 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002016}
2017
2018static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002019match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002020{
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002021 Py_ssize_t i = match_getindex(self, index);
2022
2023 if (i < 0) {
2024 return NULL;
2025 }
2026
2027 return match_getslice_by_index(self, i, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002028}
2029
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002030/*[clinic input]
2031_sre.SRE_Match.expand
2032
2033 template: object
2034
2035Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2036[clinic start generated code]*/
2037
2038static PyObject *
2039_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2040/*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002041{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002042 /* delegate to Python code */
2043 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002044 SRE_PY_MODULE, "_expand",
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002045 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002046 );
2047}
2048
2049static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002050match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002051{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002053 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002054
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002055 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002056
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002057 switch (size) {
2058 case 0:
Victor Stinner37834132020-10-27 17:12:53 +01002059 result = match_getslice(self, _PyLong_GetZero(), Py_None);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002060 break;
2061 case 1:
2062 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2063 break;
2064 default:
2065 /* fetch multiple items */
2066 result = PyTuple_New(size);
2067 if (!result)
2068 return NULL;
2069 for (i = 0; i < size; i++) {
2070 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002071 self, PyTuple_GET_ITEM(args, i), Py_None
2072 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 if (!item) {
2074 Py_DECREF(result);
2075 return NULL;
2076 }
2077 PyTuple_SET_ITEM(result, i, item);
2078 }
2079 break;
2080 }
2081 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002082}
2083
Eric V. Smith605bdae2016-09-11 08:55:43 -04002084static PyObject*
2085match_getitem(MatchObject* self, PyObject* name)
2086{
2087 return match_getslice(self, name, Py_None);
2088}
2089
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002090/*[clinic input]
2091_sre.SRE_Match.groups
2092
2093 default: object = None
2094 Is used for groups that did not participate in the match.
2095
2096Return a tuple containing all the subgroups of the match, from 1.
2097[clinic start generated code]*/
2098
2099static PyObject *
2100_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2101/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002102{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002103 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002104 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002105
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002106 result = PyTuple_New(self->groups-1);
2107 if (!result)
2108 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002109
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002110 for (index = 1; index < self->groups; index++) {
2111 PyObject* item;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002112 item = match_getslice_by_index(self, index, default_value);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002113 if (!item) {
2114 Py_DECREF(result);
2115 return NULL;
2116 }
2117 PyTuple_SET_ITEM(result, index-1, item);
2118 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002119
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002120 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002121}
2122
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002123/*[clinic input]
2124_sre.SRE_Match.groupdict
2125
2126 default: object = None
2127 Is used for groups that did not participate in the match.
2128
2129Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2130[clinic start generated code]*/
2131
2132static PyObject *
2133_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2134/*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002135{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002136 PyObject *result;
2137 PyObject *key;
2138 PyObject *value;
2139 Py_ssize_t pos = 0;
2140 Py_hash_t hash;
Guido van Rossumb700df92000-03-31 14:59:30 +00002141
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002142 result = PyDict_New();
2143 if (!result || !self->pattern->groupindex)
2144 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002145
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002146 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002147 int status;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002148 Py_INCREF(key);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002149 value = match_getslice(self, key, default_value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002150 if (!value) {
2151 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002152 goto failed;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002153 }
2154 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002155 Py_DECREF(value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002156 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002157 if (status < 0)
2158 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002159 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002160
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002161 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002162
2163failed:
Fredrik Lundh770617b2001-01-14 15:06:11 +00002164 Py_DECREF(result);
2165 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002166}
2167
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002168/*[clinic input]
2169_sre.SRE_Match.start -> Py_ssize_t
2170
2171 group: object(c_default="NULL") = 0
2172 /
2173
2174Return index of the start of the substring matched by group.
2175[clinic start generated code]*/
2176
2177static Py_ssize_t
2178_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2179/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002180{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002181 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002182
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002183 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002184 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002185 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002186
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002187 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002188 return self->mark[index*2];
Guido van Rossumb700df92000-03-31 14:59:30 +00002189}
2190
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002191/*[clinic input]
2192_sre.SRE_Match.end -> Py_ssize_t
2193
2194 group: object(c_default="NULL") = 0
2195 /
2196
2197Return index of the end of the substring matched by group.
2198[clinic start generated code]*/
2199
2200static Py_ssize_t
2201_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2202/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002203{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002204 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002205
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002206 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002207 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002208 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002209
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002210 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002211 return self->mark[index*2+1];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002212}
2213
2214LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002215_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002216{
2217 PyObject* pair;
2218 PyObject* item;
2219
2220 pair = PyTuple_New(2);
2221 if (!pair)
2222 return NULL;
2223
Christian Heimes217cfd12007-12-02 14:31:20 +00002224 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002225 if (!item)
2226 goto error;
2227 PyTuple_SET_ITEM(pair, 0, item);
2228
Christian Heimes217cfd12007-12-02 14:31:20 +00002229 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002230 if (!item)
2231 goto error;
2232 PyTuple_SET_ITEM(pair, 1, item);
2233
2234 return pair;
2235
2236 error:
2237 Py_DECREF(pair);
2238 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002239}
2240
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002241/*[clinic input]
2242_sre.SRE_Match.span
2243
2244 group: object(c_default="NULL") = 0
2245 /
2246
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002247For match object m, return the 2-tuple (m.start(group), m.end(group)).
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002248[clinic start generated code]*/
2249
2250static PyObject *
2251_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002252/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002253{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002254 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002255
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002256 if (index < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002257 return NULL;
2258 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002259
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002260 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002261 return _pair(self->mark[index*2], self->mark[index*2+1]);
2262}
2263
2264static PyObject*
2265match_regs(MatchObject* self)
2266{
2267 PyObject* regs;
2268 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002269 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002270
2271 regs = PyTuple_New(self->groups);
2272 if (!regs)
2273 return NULL;
2274
2275 for (index = 0; index < self->groups; index++) {
2276 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2277 if (!item) {
2278 Py_DECREF(regs);
2279 return NULL;
2280 }
2281 PyTuple_SET_ITEM(regs, index, item);
2282 }
2283
2284 Py_INCREF(regs);
2285 self->regs = regs;
2286
2287 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002288}
2289
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002290/*[clinic input]
2291_sre.SRE_Match.__copy__
2292
2293[clinic start generated code]*/
2294
2295static PyObject *
2296_sre_SRE_Match___copy___impl(MatchObject *self)
2297/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002298{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002299 Py_INCREF(self);
2300 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002301}
2302
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002303/*[clinic input]
2304_sre.SRE_Match.__deepcopy__
2305
2306 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002307 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002308
2309[clinic start generated code]*/
2310
2311static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002312_sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2313/*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002314{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002315 Py_INCREF(self);
2316 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002317}
2318
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002319PyDoc_STRVAR(match_doc,
2320"The result of re.match() and re.search().\n\
2321Match objects always have a boolean value of True.");
2322
2323PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002324"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02002325 Return subgroup(s) of the match by indices or names.\n\
2326 For 0 returns the entire match.");
2327
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002328static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002329match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
Guido van Rossumb700df92000-03-31 14:59:30 +00002330{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002331 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002332 return PyLong_FromSsize_t(self->lastindex);
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002333 Py_RETURN_NONE;
Guido van Rossumb700df92000-03-31 14:59:30 +00002334}
2335
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002336static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002337match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002338{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002339 if (self->pattern->indexgroup &&
2340 self->lastindex >= 0 &&
2341 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2342 {
2343 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2344 self->lastindex);
2345 Py_INCREF(result);
2346 return result;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002347 }
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002348 Py_RETURN_NONE;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002349}
2350
2351static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002352match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002353{
2354 if (self->regs) {
2355 Py_INCREF(self->regs);
2356 return self->regs;
2357 } else
2358 return match_regs(self);
2359}
2360
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002361static PyObject *
2362match_repr(MatchObject *self)
2363{
2364 PyObject *result;
2365 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2366 if (group0 == NULL)
2367 return NULL;
2368 result = PyUnicode_FromFormat(
sth8b91eda2019-03-10 11:29:14 +01002369 "<%s object; span=(%zd, %zd), match=%.50R>",
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002370 Py_TYPE(self)->tp_name,
2371 self->mark[0], self->mark[1], group0);
2372 Py_DECREF(group0);
2373 return result;
2374}
2375
2376
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002377static PyObject*
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002378pattern_new_match(_sremodulestate* module_state,
2379 PatternObject* pattern,
2380 SRE_STATE* state,
2381 Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002382{
2383 /* create match object (from state object) */
2384
2385 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002386 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002387 char* base;
2388 int n;
2389
2390 if (status > 0) {
2391
2392 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002393 /* coverity[ampersand_in_size] */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002394 match = PyObject_NewVar(MatchObject,
2395 module_state->Match_Type,
Victor Stinner92055202020-04-08 00:38:15 +02002396 2*(pattern->groups+1));
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002397 if (!match)
2398 return NULL;
2399
2400 Py_INCREF(pattern);
2401 match->pattern = pattern;
2402
2403 Py_INCREF(state->string);
2404 match->string = state->string;
2405
2406 match->regs = NULL;
2407 match->groups = pattern->groups+1;
2408
2409 /* fill in group slices */
2410
2411 base = (char*) state->beginning;
2412 n = state->charsize;
2413
2414 match->mark[0] = ((char*) state->start - base) / n;
2415 match->mark[1] = ((char*) state->ptr - base) / n;
2416
2417 for (i = j = 0; i < pattern->groups; i++, j+=2)
2418 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2419 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2420 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2421 } else
2422 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2423
2424 match->pos = state->pos;
2425 match->endpos = state->endpos;
2426
2427 match->lastindex = state->lastindex;
2428
2429 return (PyObject*) match;
2430
2431 } else if (status == 0) {
2432
2433 /* no match */
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002434 Py_RETURN_NONE;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002435
2436 }
2437
2438 /* internal error */
2439 pattern_error(status);
2440 return NULL;
2441}
2442
2443
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002444/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002445/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002446
2447static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002448scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002449{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002450 PyTypeObject *tp = Py_TYPE(self);
2451
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002452 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002453 Py_XDECREF(self->pattern);
Victor Stinner32bd68c2020-12-01 10:37:39 +01002454 PyObject_Free(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002455 Py_DECREF(tp);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002456}
2457
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002458/*[clinic input]
2459_sre.SRE_Scanner.match
2460
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002461 cls: defining_class
2462 /
2463
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002464[clinic start generated code]*/
2465
2466static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002467_sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2468/*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002469{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002470 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002471 SRE_STATE* state = &self->state;
2472 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002473 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002474
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002475 if (state->start == NULL)
2476 Py_RETURN_NONE;
2477
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002478 state_reset(state);
2479
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002480 state->ptr = state->start;
2481
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002482 status = sre_match(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002483 if (PyErr_Occurred())
2484 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002485
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002486 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2487 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002488
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002489 if (status == 0)
2490 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002491 else {
2492 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002493 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002494 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002495
2496 return match;
2497}
2498
2499
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002500/*[clinic input]
2501_sre.SRE_Scanner.search
2502
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002503 cls: defining_class
2504 /
2505
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002506[clinic start generated code]*/
2507
2508static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002509_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2510/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002511{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002512 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002513 SRE_STATE* state = &self->state;
2514 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002515 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002516
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002517 if (state->start == NULL)
2518 Py_RETURN_NONE;
2519
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002520 state_reset(state);
2521
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002522 state->ptr = state->start;
2523
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002524 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002525 if (PyErr_Occurred())
2526 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002527
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002528 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2529 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002530
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002531 if (status == 0)
2532 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002533 else {
2534 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002535 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002536 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002537
2538 return match;
2539}
2540
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002541static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002542pattern_scanner(_sremodulestate *module_state,
2543 PatternObject *self,
2544 PyObject *string,
2545 Py_ssize_t pos,
2546 Py_ssize_t endpos)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002547{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002548 ScannerObject* scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002549
2550 /* create scanner object */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002551 scanner = PyObject_New(ScannerObject, module_state->Scanner_Type);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002552 if (!scanner)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002553 return NULL;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002554 scanner->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002555
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002556 /* create search state object */
2557 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2558 Py_DECREF(scanner);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002559 return NULL;
2560 }
2561
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002562 Py_INCREF(self);
2563 scanner->pattern = (PyObject*) self;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002564
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002565 return (PyObject*) scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002566}
2567
Victor Stinnerb44fb122016-11-21 16:35:08 +01002568static Py_hash_t
2569pattern_hash(PatternObject *self)
2570{
2571 Py_hash_t hash, hash2;
2572
2573 hash = PyObject_Hash(self->pattern);
2574 if (hash == -1) {
2575 return -1;
2576 }
2577
2578 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2579 hash ^= hash2;
2580
2581 hash ^= self->flags;
2582 hash ^= self->isbytes;
2583 hash ^= self->codesize;
2584
2585 if (hash == -1) {
2586 hash = -2;
2587 }
2588 return hash;
2589}
2590
2591static PyObject*
2592pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2593{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002594 PyTypeObject *tp = Py_TYPE(lefto);
2595 _sremodulestate *module_state = get_sre_module_state_by_class(tp);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002596 PatternObject *left, *right;
2597 int cmp;
2598
2599 if (op != Py_EQ && op != Py_NE) {
2600 Py_RETURN_NOTIMPLEMENTED;
2601 }
2602
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002603 if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2604 {
Victor Stinnerb44fb122016-11-21 16:35:08 +01002605 Py_RETURN_NOTIMPLEMENTED;
2606 }
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01002607
2608 if (lefto == righto) {
2609 /* a pattern is equal to itself */
2610 return PyBool_FromLong(op == Py_EQ);
2611 }
2612
Victor Stinnerb44fb122016-11-21 16:35:08 +01002613 left = (PatternObject *)lefto;
2614 right = (PatternObject *)righto;
2615
2616 cmp = (left->flags == right->flags
2617 && left->isbytes == right->isbytes
Victor Stinnere670b2d2016-11-22 15:23:00 +01002618 && left->codesize == right->codesize);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002619 if (cmp) {
2620 /* Compare the code and the pattern because the same pattern can
2621 produce different codes depending on the locale used to compile the
2622 pattern when the re.LOCALE flag is used. Don't compare groups,
2623 indexgroup nor groupindex: they are derivated from the pattern. */
2624 cmp = (memcmp(left->code, right->code,
2625 sizeof(left->code[0]) * left->codesize) == 0);
2626 }
2627 if (cmp) {
2628 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2629 Py_EQ);
2630 if (cmp < 0) {
2631 return NULL;
2632 }
2633 }
2634 if (op == Py_NE) {
2635 cmp = !cmp;
2636 }
2637 return PyBool_FromLong(cmp);
2638}
2639
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002640#include "clinic/_sre.c.h"
2641
2642static PyMethodDef pattern_methods[] = {
2643 _SRE_SRE_PATTERN_MATCH_METHODDEF
2644 _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
2645 _SRE_SRE_PATTERN_SEARCH_METHODDEF
2646 _SRE_SRE_PATTERN_SUB_METHODDEF
2647 _SRE_SRE_PATTERN_SUBN_METHODDEF
2648 _SRE_SRE_PATTERN_FINDALL_METHODDEF
2649 _SRE_SRE_PATTERN_SPLIT_METHODDEF
2650 _SRE_SRE_PATTERN_FINDITER_METHODDEF
2651 _SRE_SRE_PATTERN_SCANNER_METHODDEF
2652 _SRE_SRE_PATTERN___COPY___METHODDEF
2653 _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002654 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2655 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002656 {NULL, NULL}
2657};
2658
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002659static PyGetSetDef pattern_getset[] = {
2660 {"groupindex", (getter)pattern_groupindex, (setter)NULL,
2661 "A dictionary mapping group names to group numbers."},
2662 {NULL} /* Sentinel */
2663};
2664
2665#define PAT_OFF(x) offsetof(PatternObject, x)
2666static PyMemberDef pattern_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002667 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY,
2668 "The pattern string from which the RE object was compiled."},
2669 {"flags", T_INT, PAT_OFF(flags), READONLY,
2670 "The regex matching flags."},
2671 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY,
2672 "The number of capturing groups in the pattern."},
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002673 {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002674 {NULL} /* Sentinel */
2675};
2676
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002677static PyType_Slot pattern_slots[] = {
2678 {Py_tp_dealloc, (destructor)pattern_dealloc},
2679 {Py_tp_repr, (reprfunc)pattern_repr},
2680 {Py_tp_hash, (hashfunc)pattern_hash},
2681 {Py_tp_doc, (void *)pattern_doc},
2682 {Py_tp_richcompare, pattern_richcompare},
2683 {Py_tp_methods, pattern_methods},
2684 {Py_tp_members, pattern_members},
2685 {Py_tp_getset, pattern_getset},
2686 {0, NULL},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002687};
2688
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002689static PyType_Spec pattern_spec = {
2690 .name = "re.Pattern",
2691 .basicsize = sizeof(PatternObject),
2692 .itemsize = sizeof(SRE_CODE),
2693 .flags = Py_TPFLAGS_DEFAULT,
2694 .slots = pattern_slots,
Eric V. Smith605bdae2016-09-11 08:55:43 -04002695};
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002696
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002697static PyMethodDef match_methods[] = {
2698 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
2699 _SRE_SRE_MATCH_START_METHODDEF
2700 _SRE_SRE_MATCH_END_METHODDEF
2701 _SRE_SRE_MATCH_SPAN_METHODDEF
2702 _SRE_SRE_MATCH_GROUPS_METHODDEF
2703 _SRE_SRE_MATCH_GROUPDICT_METHODDEF
2704 _SRE_SRE_MATCH_EXPAND_METHODDEF
2705 _SRE_SRE_MATCH___COPY___METHODDEF
2706 _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
Guido van Rossum48b069a2020-04-07 09:50:06 -07002707 {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
2708 PyDoc_STR("See PEP 585")},
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002709 {NULL, NULL}
2710};
2711
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002712static PyGetSetDef match_getset[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002713 {"lastindex", (getter)match_lastindex_get, (setter)NULL,
2714 "The integer index of the last matched capturing group."},
2715 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
2716 "The name of the last matched capturing group."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002717 {"regs", (getter)match_regs_get, (setter)NULL},
2718 {NULL}
2719};
2720
2721#define MATCH_OFF(x) offsetof(MatchObject, x)
2722static PyMemberDef match_members[] = {
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002723 {"string", T_OBJECT, MATCH_OFF(string), READONLY,
2724 "The string passed to match() or search()."},
2725 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY,
2726 "The regular expression object."},
2727 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY,
2728 "The index into the string at which the RE engine started looking for a match."},
2729 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY,
2730 "The index into the string beyond which the RE engine will not go."},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002731 {NULL}
2732};
2733
2734/* FIXME: implement setattr("string", None) as a special case (to
2735 detach the associated string, if any */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002736static PyType_Slot match_slots[] = {
2737 {Py_tp_dealloc, match_dealloc},
2738 {Py_tp_repr, match_repr},
2739 {Py_tp_doc, (void *)match_doc},
2740 {Py_tp_methods, match_methods},
2741 {Py_tp_members, match_members},
2742 {Py_tp_getset, match_getset},
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002743
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002744 /* As mapping.
2745 *
2746 * Match objects do not support length or assignment, but do support
2747 * __getitem__.
2748 */
2749 {Py_mp_subscript, match_getitem},
2750
2751 {0, NULL},
2752};
2753
2754static PyType_Spec match_spec = {
2755 .name = "re.Match",
2756 .basicsize = sizeof(MatchObject),
2757 .itemsize = sizeof(Py_ssize_t),
2758 .flags = Py_TPFLAGS_DEFAULT,
2759 .slots = match_slots,
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002760};
2761
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002762static PyMethodDef scanner_methods[] = {
2763 _SRE_SRE_SCANNER_MATCH_METHODDEF
2764 _SRE_SRE_SCANNER_SEARCH_METHODDEF
2765 {NULL, NULL}
2766};
2767
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002768#define SCAN_OFF(x) offsetof(ScannerObject, x)
2769static PyMemberDef scanner_members[] = {
2770 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
2771 {NULL} /* Sentinel */
2772};
2773
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002774static PyType_Slot scanner_slots[] = {
2775 {Py_tp_dealloc, scanner_dealloc},
2776 {Py_tp_methods, scanner_methods},
2777 {Py_tp_members, scanner_members},
2778 {0, NULL},
2779};
2780
2781static PyType_Spec scanner_spec = {
2782 .name = "_" SRE_MODULE ".SRE_Scanner",
2783 .basicsize = sizeof(ScannerObject),
2784 .flags = Py_TPFLAGS_DEFAULT,
2785 .slots = scanner_slots,
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002786};
2787
Guido van Rossumb700df92000-03-31 14:59:30 +00002788static PyMethodDef _functions[] = {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002789 _SRE_COMPILE_METHODDEF
2790 _SRE_GETCODESIZE_METHODDEF
Serhiy Storchaka6d336a02017-05-09 23:37:14 +03002791 _SRE_ASCII_ISCASED_METHODDEF
2792 _SRE_UNICODE_ISCASED_METHODDEF
Serhiy Storchaka7186cc22017-05-05 10:42:46 +03002793 _SRE_ASCII_TOLOWER_METHODDEF
2794 _SRE_UNICODE_TOLOWER_METHODDEF
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002795 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002796};
2797
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002798static int
2799sre_traverse(PyObject *module, visitproc visit, void *arg)
2800{
2801 _sremodulestate *state = get_sre_module_state(module);
2802
2803 Py_VISIT(state->Pattern_Type);
2804 Py_VISIT(state->Match_Type);
2805 Py_VISIT(state->Scanner_Type);
2806
2807 return 0;
2808}
2809
2810static int
2811sre_clear(PyObject *module)
2812{
2813 _sremodulestate *state = get_sre_module_state(module);
2814
2815 Py_CLEAR(state->Pattern_Type);
2816 Py_CLEAR(state->Match_Type);
2817 Py_CLEAR(state->Scanner_Type);
2818
2819 return 0;
2820}
2821
2822static void
2823sre_free(void *module)
2824{
2825 sre_clear((PyObject *)module);
2826}
2827
/* Create a heap type from `spec`, bound to module `m`, and store it in the
   lvalue `type`.  Jumps to the caller's `error` label on failure. */
#define CREATE_TYPE(m, type, spec)                                      \
    do {                                                                \
        type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
        if (type == NULL) {                                             \
            goto error;                                                 \
        }                                                               \
    } while (0)
2835
/* Add the unsigned long `value` to `module` under `name`.  The temporary int
   object is released whether or not the add succeeds.  Jumps to the caller's
   `error` label on failure. */
#define ADD_ULONG_CONSTANT(module, name, value)                 \
    do {                                                        \
        PyObject *o = PyLong_FromUnsignedLong(value);           \
        if (o == NULL) {                                        \
            goto error;                                         \
        }                                                       \
        int res = PyModule_AddObjectRef(module, name, o);       \
        Py_DECREF(o);                                           \
        if (res < 0) {                                          \
            goto error;                                         \
        }                                                       \
    } while (0)
2847
2848static int
2849sre_exec(PyObject *m)
2850{
2851 _sremodulestate *state;
2852
2853 /* Create heap types */
2854 state = get_sre_module_state(m);
2855 CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
2856 CREATE_TYPE(m, state->Match_Type, &match_spec);
2857 CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
2858
2859 if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
2860 goto error;
2861 }
2862
2863 if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
2864 goto error;
2865 }
2866
2867 ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
2868 ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
2869
2870 if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
2871 goto error;
2872 }
2873
2874 return 0;
2875
2876error:
2877 return -1;
2878}
2879
2880static PyModuleDef_Slot sre_slots[] = {
2881 {Py_mod_exec, sre_exec},
2882 {0, NULL},
Martin v. Löwis1a214512008-06-11 05:26:20 +00002883};
2884
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002885static struct PyModuleDef sremodule = {
2886 .m_base = PyModuleDef_HEAD_INIT,
2887 .m_name = "_" SRE_MODULE,
2888 .m_size = sizeof(_sremodulestate),
2889 .m_methods = _functions,
2890 .m_slots = sre_slots,
2891 .m_traverse = sre_traverse,
2892 .m_free = sre_free,
2893 .m_clear = sre_clear,
2894};
2895
2896PyMODINIT_FUNC
2897PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00002898{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002899 return PyModuleDef_Init(&sremodule);
Guido van Rossumb700df92000-03-31 14:59:30 +00002900}
2901
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002902/* vim:ts=4:sw=4:et
2903*/