blob: 0a5ca60097af365612e4f30607be420744bce6c7 [file] [log] [blame]
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * partial history:
 * 1999-10-24 fl  created (based on existing template matcher code)
 * 2000-03-06 fl  first alpha, sort of
 * 2000-08-01 fl  fixes for 1.6b1
 * 2000-08-07 fl  use PyOS_CheckStack() if available
 * 2000-09-20 fl  added expand method
 * 2001-03-20 fl  lots of fixes for 2.1b2
 * 2001-04-15 fl  export copyright as Python attribute, not global
 * 2001-04-28 fl  added __copy__ methods (work in progress)
 * 2001-05-14 fl  fixes for 1.5.2 compatibility
 * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
 * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
 * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
 * 2001-10-21 fl  added sub/subn primitive
 * 2001-10-24 fl  added finditer primitive (for 2.2 only)
 * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
 * 2002-11-09 fl  fixed empty sub/subn return type
 * 2003-04-18 mvl fully support 4-byte codes
 * 2003-10-17 gn  implemented non recursive scheme
 * 2013-02-04 mrab added fullmatch primitive
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * This version of the SRE library can be redistributed under CNRI's
 * Python 1.6 license.  For any other use, please contact Secret Labs
 * AB (info@pythonware.com).
 *
 * Portions of this engine have been developed in cooperation with
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 * other compatibility work.
 */
37
/* SRE version and copyright banner string. */
static const char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000040
Thomas Wouters0e3f5912006-08-11 14:57:12 +000041#define PY_SSIZE_T_CLEAN
42
Guido van Rossumb700df92000-03-31 14:59:30 +000043#include "Python.h"
Victor Stinner37834132020-10-27 17:12:53 +010044#include "pycore_long.h" // _PyLong_GetZero()
Victor Stinner4a21e572020-04-15 02:35:41 +020045#include "structmember.h" // PyMemberDef
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
/* Width of one SRE_CODE unit, computed as 8 bits per byte of its size. */
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
50
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000051#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000052
/* name of this module, minus the leading underscore; may be overridden
   by defining SRE_MODULE before this point */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

#define SRE_PY_MODULE "re"

/* defining this one enables tracing */
#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000062
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000063/* -------------------------------------------------------------------- */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064
/* LOCAL(type) declares a file-local inline function returning `type`. */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#define LOCAL(type) static inline type
#endif
73
/* error codes (parenthesized so they stay intact inside larger expressions) */
#define SRE_ERROR_ILLEGAL (-1) /* illegal opcode */
#define SRE_ERROR_STATE (-2) /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT (-3) /* runaway recursion */
#define SRE_ERROR_MEMORY (-9) /* out of memory */
#define SRE_ERROR_INTERRUPTED (-10) /* signal handler raised exception */

/* TRACE takes one fully parenthesized printf argument list, e.g.
   TRACE(("x=%d\n", x)); it expands to nothing unless VERBOSE is defined */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
86
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000087/* -------------------------------------------------------------------- */
88/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +000089
/* Character predicates built on the Py_IS* macros.  Each one first
   compares ch against the highest character of its class, so the
   Py_IS* macro is only evaluated for values at or below that bound.
   Note that ch is evaluated more than once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch)\
    ((ch) <= ' ' && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch)\
    ((ch) == '\n')
#define SRE_IS_WORD(ch)\
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
Guido van Rossumb700df92000-03-31 14:59:30 +000098
/* Return Py_TOLOWER(ch) for ch below 128; any other value is returned
   unchanged. */
static unsigned int
sre_lower_ascii(unsigned int ch)
{
    if (ch < 128) {
        return Py_TOLOWER(ch);
    }
    return ch;
}
103
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
 * warnings when c's type supports only numbers < N+1 */
/* values with any bit above the low 8 set are never alphanumeric here */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
109
/* Lower-case ch with the C library's tolower() when ch is below 256;
   larger values are returned unchanged. */
static unsigned int
sre_lower_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)tolower(ch);
    }
    return ch;
}
114
/* Upper-case ch with the C library's toupper() when ch is below 256;
   larger values are returned unchanged. */
static unsigned int
sre_upper_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)toupper(ch);
    }
    return ch;
}
119
/* unicode-specific character predicates (thin wrappers over the
   Py_UNICODE_IS* macros; "word" additionally accepts '_') */

#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000127
/* Return the result of Py_UNICODE_TOLOWER for ch. */
static unsigned int
sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER(ch);
}
132
/* Return the result of Py_UNICODE_TOUPPER for ch. */
static unsigned int
sre_upper_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOUPPER(ch);
}
137
Guido van Rossumb700df92000-03-31 14:59:30 +0000138LOCAL(int)
139sre_category(SRE_CODE category, unsigned int ch)
140{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000141 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000142
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000143 case SRE_CATEGORY_DIGIT:
144 return SRE_IS_DIGIT(ch);
145 case SRE_CATEGORY_NOT_DIGIT:
146 return !SRE_IS_DIGIT(ch);
147 case SRE_CATEGORY_SPACE:
148 return SRE_IS_SPACE(ch);
149 case SRE_CATEGORY_NOT_SPACE:
150 return !SRE_IS_SPACE(ch);
151 case SRE_CATEGORY_WORD:
152 return SRE_IS_WORD(ch);
153 case SRE_CATEGORY_NOT_WORD:
154 return !SRE_IS_WORD(ch);
155 case SRE_CATEGORY_LINEBREAK:
156 return SRE_IS_LINEBREAK(ch);
157 case SRE_CATEGORY_NOT_LINEBREAK:
158 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000159
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000160 case SRE_CATEGORY_LOC_WORD:
161 return SRE_LOC_IS_WORD(ch);
162 case SRE_CATEGORY_LOC_NOT_WORD:
163 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000164
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000165 case SRE_CATEGORY_UNI_DIGIT:
166 return SRE_UNI_IS_DIGIT(ch);
167 case SRE_CATEGORY_UNI_NOT_DIGIT:
168 return !SRE_UNI_IS_DIGIT(ch);
169 case SRE_CATEGORY_UNI_SPACE:
170 return SRE_UNI_IS_SPACE(ch);
171 case SRE_CATEGORY_UNI_NOT_SPACE:
172 return !SRE_UNI_IS_SPACE(ch);
173 case SRE_CATEGORY_UNI_WORD:
174 return SRE_UNI_IS_WORD(ch);
175 case SRE_CATEGORY_UNI_NOT_WORD:
176 return !SRE_UNI_IS_WORD(ch);
177 case SRE_CATEGORY_UNI_LINEBREAK:
178 return SRE_UNI_IS_LINEBREAK(ch);
179 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
180 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000181 }
182 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000183}
184
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300185LOCAL(int)
186char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
187{
188 return ch == pattern
189 || (SRE_CODE) sre_lower_locale(ch) == pattern
190 || (SRE_CODE) sre_upper_locale(ch) == pattern;
191}
192
193
Guido van Rossumb700df92000-03-31 14:59:30 +0000194/* helpers */
195
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000196static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000197data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000198{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000199 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000201 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000202 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000203 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000204}
205
206static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000207data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000208{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000209 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000210 minsize = state->data_stack_base+size;
211 cursize = state->data_stack_size;
212 if (cursize < minsize) {
213 void* stack;
214 cursize = minsize+minsize/4+1024;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +0200215 TRACE(("allocate/grow stack %zd\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000216 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000217 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000218 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000219 return SRE_ERROR_MEMORY;
220 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000221 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000222 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000223 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000224 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000225}
226
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000227/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000228
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300229#define SRE_CHAR Py_UCS1
230#define SIZEOF_SRE_CHAR 1
231#define SRE(F) sre_ucs1_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300232#include "sre_lib.h"
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000233
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300234/* generate 16-bit unicode version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000235
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300236#define SRE_CHAR Py_UCS2
237#define SIZEOF_SRE_CHAR 2
238#define SRE(F) sre_ucs2_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300239#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000240
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300241/* generate 32-bit unicode version */
242
243#define SRE_CHAR Py_UCS4
244#define SIZEOF_SRE_CHAR 4
245#define SRE(F) sre_ucs4_##F
Serhiy Storchaka8444ebb2013-10-26 11:18:42 +0300246#include "sre_lib.h"
Guido van Rossumb700df92000-03-31 14:59:30 +0000247
248/* -------------------------------------------------------------------- */
249/* factories and destructors */
250
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100251/* module state */
252typedef struct {
253 PyTypeObject *Pattern_Type;
254 PyTypeObject *Match_Type;
255 PyTypeObject *Scanner_Type;
256} _sremodulestate;
Guido van Rossumb700df92000-03-31 14:59:30 +0000257
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100258static _sremodulestate *
259get_sre_module_state(PyObject *m)
260{
261 _sremodulestate *state = (_sremodulestate *)PyModule_GetState(m);
262 assert(state);
263 return state;
264}
265
266static struct PyModuleDef sremodule;
267#define get_sre_module_state_by_class(cls) \
268 (get_sre_module_state(PyType_GetModule(cls)))
269
270/* see sre.h for object declarations */
271static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
272static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300273
/*[clinic input]
module _sre
class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
Larry Hastings2d0a69a2015-05-03 14:49:19 -0700281
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300282/*[clinic input]
283_sre.getcodesize -> int
284[clinic start generated code]*/
285
286static int
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +0300287_sre_getcodesize_impl(PyObject *module)
288/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000289{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300290 return sizeof(SRE_CODE);
Guido van Rossumb700df92000-03-31 14:59:30 +0000291}
292
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300293/*[clinic input]
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300294_sre.ascii_iscased -> bool
295
296 character: int
297 /
298
299[clinic start generated code]*/
300
301static int
302_sre_ascii_iscased_impl(PyObject *module, int character)
303/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
304{
305 unsigned int ch = (unsigned int)character;
Sergey Fedoseev7f0d59f2018-09-12 17:49:09 +0500306 return ch < 128 && Py_ISALPHA(ch);
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300307}
308
309/*[clinic input]
310_sre.unicode_iscased -> bool
311
312 character: int
313 /
314
315[clinic start generated code]*/
316
317static int
318_sre_unicode_iscased_impl(PyObject *module, int character)
319/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
320{
321 unsigned int ch = (unsigned int)character;
322 return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
323}
324
325/*[clinic input]
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300326_sre.ascii_tolower -> int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300327
328 character: int
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300329 /
330
331[clinic start generated code]*/
332
333static int
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300334_sre_ascii_tolower_impl(PyObject *module, int character)
335/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000336{
Serhiy Storchaka3557b052017-10-24 23:31:42 +0300337 return sre_lower_ascii(character);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000338}
339
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300340/*[clinic input]
341_sre.unicode_tolower -> int
342
343 character: int
344 /
345
346[clinic start generated code]*/
347
348static int
349_sre_unicode_tolower_impl(PyObject *module, int character)
350/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
351{
352 return sre_lower_unicode(character);
353}
354
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000355LOCAL(void)
356state_reset(SRE_STATE* state)
357{
animalize4a7f44a2019-02-18 21:26:37 +0800358 /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000359 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000360
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000361 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000362 state->lastindex = -1;
363
364 state->repeat = NULL;
365
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000366 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000367}
368
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300369static const void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200370getstring(PyObject* string, Py_ssize_t* p_length,
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300371 int* p_isbytes, int* p_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -0600372 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +0000373{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000374 /* given a python object, return a data pointer, a length (in
375 characters), and a character size. return NULL if the object
376 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +0000377
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000378 /* Unicode objects do not support the buffer API. So, get the data
379 directly instead. */
380 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200381 if (PyUnicode_READY(string) == -1)
382 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200383 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200384 *p_charsize = PyUnicode_KIND(string);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300385 *p_isbytes = 0;
386 return PyUnicode_DATA(string);
Alexandre Vassalotti70a23712007-10-14 02:05:51 +0000387 }
388
Victor Stinner0058b862011-09-29 03:27:47 +0200389 /* get pointer to byte string buffer */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300390 if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200391 PyErr_SetString(PyExc_TypeError, "expected string or bytes-like object");
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300392 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000393 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000394
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300395 *p_length = view->len;
396 *p_charsize = 1;
397 *p_isbytes = 1;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +0000398
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300399 if (view->buf == NULL) {
400 PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
401 PyBuffer_Release(view);
402 view->buf = NULL;
403 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000404 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300405 return view->buf;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000406}
407
408LOCAL(PyObject*)
409state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000410 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000411{
412 /* prepare state object */
413
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000414 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300415 int isbytes, charsize;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300416 const void* ptr;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000417
418 memset(state, 0, sizeof(SRE_STATE));
419
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300420 state->mark = PyMem_New(const void *, pattern->groups * 2);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300421 if (!state->mark) {
422 PyErr_NoMemory();
423 goto err;
424 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000425 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000426 state->lastindex = -1;
427
Benjamin Petersone48944b2012-03-07 14:50:25 -0600428 state->buffer.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300429 ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000430 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -0600431 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000432
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300433 if (isbytes && pattern->isbytes == 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600434 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200435 "cannot use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600436 goto err;
437 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300438 if (!isbytes && pattern->isbytes > 0) {
Benjamin Petersone48944b2012-03-07 14:50:25 -0600439 PyErr_SetString(PyExc_TypeError,
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200440 "cannot use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -0600441 goto err;
442 }
Antoine Pitroufd036452008-08-19 17:56:33 +0000443
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000444 /* adjust boundaries */
445 if (start < 0)
446 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000447 else if (start > length)
448 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +0000449
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000450 if (end < 0)
451 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000452 else if (end > length)
453 end = length;
454
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300455 state->isbytes = isbytes;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +0000456 state->charsize = charsize;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200457 state->match_all = 0;
458 state->must_advance = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000459
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000460 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +0000461
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000462 state->start = (void*) ((char*) ptr + start * state->charsize);
463 state->end = (void*) ((char*) ptr + end * state->charsize);
464
465 Py_INCREF(string);
466 state->string = string;
467 state->pos = start;
468 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +0000469
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000470 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600471 err:
Ammar Askar06e3a272020-06-01 17:21:43 +0000472 /* We add an explicit cast here because MSVC has a bug when
473 compiling C code where it believes that `const void**` cannot be
474 safely casted to `void*`, see bpo-39943 for details. */
475 PyMem_Del((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300476 state->mark = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -0600477 if (state->buffer.buf)
478 PyBuffer_Release(&state->buffer);
479 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000480}
481
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000482LOCAL(void)
483state_fini(SRE_STATE* state)
484{
Benjamin Petersone48944b2012-03-07 14:50:25 -0600485 if (state->buffer.buf)
486 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000487 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000488 data_stack_dealloc(state);
Ammar Askar06e3a272020-06-01 17:21:43 +0000489 /* See above PyMem_Del for why we explicitly cast here. */
490 PyMem_Del((void*) state->mark);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300491 state->mark = NULL;
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000492}
493
/* calculate offset from start of string, in characters: the byte
   distance from state->beginning divided by the character size */
#define STATE_OFFSET(state, member)\
    (((const char*)(member) - (const char*)(state)->beginning) / (state)->charsize)
497
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000498LOCAL(PyObject*)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300499getslice(int isbytes, const void *ptr,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300500 PyObject* string, Py_ssize_t start, Py_ssize_t end)
501{
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300502 if (isbytes) {
Serhiy Storchaka25324972013-10-16 12:46:28 +0300503 if (PyBytes_CheckExact(string) &&
504 start == 0 && end == PyBytes_GET_SIZE(string)) {
505 Py_INCREF(string);
506 return string;
507 }
508 return PyBytes_FromStringAndSize(
509 (const char *)ptr + start, end - start);
510 }
511 else {
512 return PyUnicode_Substring(string, start, end);
513 }
514}
515
516LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000517state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000518{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000519 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +0000520
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000521 index = (index - 1) * 2;
522
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000523 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000524 if (empty)
525 /* want empty string */
526 i = j = 0;
527 else {
Serhiy Storchaka228b12e2017-01-23 09:47:21 +0200528 Py_RETURN_NONE;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000529 }
Fredrik Lundh58100642000-08-09 09:14:35 +0000530 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000531 i = STATE_OFFSET(state, state->mark[index]);
532 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000533 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000534
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300535 return getslice(state->isbytes, state->beginning, string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000536}
537
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000538static void
Victor Stinnerf5587782013-11-15 23:21:11 +0100539pattern_error(Py_ssize_t status)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000540{
541 switch (status) {
542 case SRE_ERROR_RECURSION_LIMIT:
Yury Selivanovf488fb42015-07-03 01:04:23 -0400543 /* This error code seems to be unused. */
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 PyErr_SetString(
Yury Selivanovf488fb42015-07-03 01:04:23 -0400545 PyExc_RecursionError,
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 "maximum recursion limit exceeded"
547 );
548 break;
549 case SRE_ERROR_MEMORY:
550 PyErr_NoMemory();
551 break;
Christian Heimes2380ac72008-01-09 00:17:24 +0000552 case SRE_ERROR_INTERRUPTED:
553 /* An exception has already been raised, so let it fly */
554 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000555 default:
556 /* other error codes indicate compiler/engine bugs */
557 PyErr_SetString(
558 PyExc_RuntimeError,
559 "internal error in regular expression engine"
560 );
561 }
562}
563
Guido van Rossumb700df92000-03-31 14:59:30 +0000564static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000565pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +0000566{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100567 PyTypeObject *tp = Py_TYPE(self);
568
Raymond Hettinger027bb632004-05-31 03:09:25 +0000569 if (self->weakreflist != NULL)
570 PyObject_ClearWeakRefs((PyObject *) self);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000571 Py_XDECREF(self->pattern);
572 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +0000573 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000574 PyObject_DEL(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100575 Py_DECREF(tp);
Guido van Rossumb700df92000-03-31 14:59:30 +0000576}
577
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300578LOCAL(Py_ssize_t)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200579sre_match(SRE_STATE* state, SRE_CODE* pattern)
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300580{
581 if (state->charsize == 1)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200582 return sre_ucs1_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300583 if (state->charsize == 2)
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200584 return sre_ucs2_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300585 assert(state->charsize == 4);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200586 return sre_ucs4_match(state, pattern, 1);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300587}
588
589LOCAL(Py_ssize_t)
590sre_search(SRE_STATE* state, SRE_CODE* pattern)
591{
592 if (state->charsize == 1)
593 return sre_ucs1_search(state, pattern);
594 if (state->charsize == 2)
595 return sre_ucs2_search(state, pattern);
596 assert(state->charsize == 4);
597 return sre_ucs4_search(state, pattern);
598}
599
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300600/*[clinic input]
601_sre.SRE_Pattern.match
602
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100603 cls: defining_class
604 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200605 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300606 pos: Py_ssize_t = 0
607 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300608
609Matches zero or more characters at the beginning of the string.
610[clinic start generated code]*/
611
Larry Hastings16c51912014-01-07 11:53:01 -0800612static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100613_sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
614 PyObject *string, Py_ssize_t pos,
615 Py_ssize_t endpos)
616/*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
Larry Hastings16c51912014-01-07 11:53:01 -0800617{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100618 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000619 SRE_STATE state;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100620 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300621 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000622
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300623 if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000624 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000625
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000626 state.ptr = state.start;
627
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000628 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
629
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200630 status = sre_match(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000631
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000632 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300633 if (PyErr_Occurred()) {
634 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000635 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300636 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000637
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100638 match = pattern_new_match(module_state, self, &state, status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000639 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300640 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000641}
642
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300643/*[clinic input]
644_sre.SRE_Pattern.fullmatch
645
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100646 cls: defining_class
647 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200648 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300649 pos: Py_ssize_t = 0
650 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300651
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +0300652Matches against all of the string.
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300653[clinic start generated code]*/
654
655static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100656_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
657 PyObject *string, Py_ssize_t pos,
658 Py_ssize_t endpos)
659/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200660{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100661 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200662 SRE_STATE state;
663 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300664 PyObject *match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200665
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300666 if (!state_init(&state, self, string, pos, endpos))
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200667 return NULL;
668
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200669 state.ptr = state.start;
670
671 TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
672
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200673 state.match_all = 1;
674 status = sre_match(&state, PatternObject_GetCode(self));
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200675
676 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300677 if (PyErr_Occurred()) {
678 state_fini(&state);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200679 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300680 }
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200681
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100682 match = pattern_new_match(module_state, self, &state, status);
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200683 state_fini(&state);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300684 return match;
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200685}
686
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300687/*[clinic input]
688_sre.SRE_Pattern.search
689
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100690 cls: defining_class
691 /
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200692 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300693 pos: Py_ssize_t = 0
694 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300695
696Scan through string looking for a match, and return a corresponding match object instance.
697
698Return None if no position in the string matches.
699[clinic start generated code]*/
700
701static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100702_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
703 PyObject *string, Py_ssize_t pos,
704 Py_ssize_t endpos)
705/*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000706{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100707 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000708 SRE_STATE state;
Victor Stinnerf5587782013-11-15 23:21:11 +0100709 Py_ssize_t status;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300710 PyObject *match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000711
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300712 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000713 return NULL;
714
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000715 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
716
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300717 status = sre_search(&state, PatternObject_GetCode(self));
Guido van Rossumb700df92000-03-31 14:59:30 +0000718
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000719 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
720
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300721 if (PyErr_Occurred()) {
722 state_fini(&state);
Thomas Wouters89f507f2006-12-13 04:49:30 +0000723 return NULL;
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300724 }
Thomas Wouters89f507f2006-12-13 04:49:30 +0000725
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100726 match = pattern_new_match(module_state, self, &state, status);
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300727 state_fini(&state);
728 return match;
Guido van Rossumb700df92000-03-31 14:59:30 +0000729}
730
731static PyObject*
Serhiy Storchakaef1585e2015-12-25 20:01:53 +0200732call(const char* module, const char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000733{
734 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000735 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000736 PyObject* func;
737 PyObject* result;
738
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000739 if (!args)
740 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +0000741 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000742 if (!name)
743 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000744 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000745 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000746 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000747 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +0000748 func = PyObject_GetAttrString(mod, function);
749 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000750 if (!func)
751 return NULL;
752 result = PyObject_CallObject(func, args);
753 Py_DECREF(func);
754 Py_DECREF(args);
755 return result;
756}
757
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300758/*[clinic input]
759_sre.SRE_Pattern.findall
760
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200761 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300762 pos: Py_ssize_t = 0
763 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300764
765Return a list of all non-overlapping matches of pattern in string.
766[clinic start generated code]*/
767
768static PyObject *
769_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200770 Py_ssize_t pos, Py_ssize_t endpos)
771/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
Guido van Rossumb700df92000-03-31 14:59:30 +0000772{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000773 SRE_STATE state;
774 PyObject* list;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100775 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000776 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +0000777
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300778 if (!state_init(&state, self, string, pos, endpos))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000779 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +0000780
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000781 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000782 if (!list) {
783 state_fini(&state);
784 return NULL;
785 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000786
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000787 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000788
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000789 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +0000790
Fredrik Lundhebc37b22000-10-28 19:30:41 +0000791 state_reset(&state);
792
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000793 state.ptr = state.start;
794
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300795 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300796 if (PyErr_Occurred())
797 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000798
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000799 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000800 if (status == 0)
801 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000802 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000803 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000804 }
Tim Peters3d563502006-01-21 02:47:53 +0000805
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000806 /* don't bother to build a match object */
807 switch (self->groups) {
808 case 0:
809 b = STATE_OFFSET(&state, state.start);
810 e = STATE_OFFSET(&state, state.ptr);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300811 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +0300812 string, b, e);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000813 if (!item)
814 goto error;
815 break;
816 case 1:
817 item = state_getslice(&state, 1, string, 1);
818 if (!item)
819 goto error;
820 break;
821 default:
822 item = PyTuple_New(self->groups);
823 if (!item)
824 goto error;
825 for (i = 0; i < self->groups; i++) {
826 PyObject* o = state_getslice(&state, i+1, string, 1);
827 if (!o) {
828 Py_DECREF(item);
829 goto error;
830 }
831 PyTuple_SET_ITEM(item, i, o);
832 }
833 break;
834 }
835
836 status = PyList_Append(list, item);
837 Py_DECREF(item);
838 if (status < 0)
839 goto error;
840
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200841 state.must_advance = (state.ptr == state.start);
842 state.start = state.ptr;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000843 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000844
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000845 state_fini(&state);
846 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +0000847
848error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +0000849 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000850 state_fini(&state);
851 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +0000852
Guido van Rossumb700df92000-03-31 14:59:30 +0000853}
854
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300855/*[clinic input]
856_sre.SRE_Pattern.finditer
857
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100858 cls: defining_class
859 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300860 string: object
861 pos: Py_ssize_t = 0
862 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
863
864Return an iterator over all non-overlapping matches for the RE pattern in string.
865
866For each match, the iterator returns a match object.
867[clinic start generated code]*/
868
869static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100870_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
871 PyObject *string, Py_ssize_t pos,
872 Py_ssize_t endpos)
873/*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
Fredrik Lundh703ce812001-10-24 22:16:30 +0000874{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100875 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000876 PyObject* scanner;
877 PyObject* search;
878 PyObject* iterator;
879
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100880 scanner = pattern_scanner(module_state, self, string, pos, endpos);
Fredrik Lundh703ce812001-10-24 22:16:30 +0000881 if (!scanner)
882 return NULL;
883
884 search = PyObject_GetAttrString(scanner, "search");
885 Py_DECREF(scanner);
886 if (!search)
887 return NULL;
888
889 iterator = PyCallIter_New(search, Py_None);
890 Py_DECREF(search);
891
892 return iterator;
893}
Fredrik Lundh703ce812001-10-24 22:16:30 +0000894
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300895/*[clinic input]
896_sre.SRE_Pattern.scanner
897
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100898 cls: defining_class
899 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300900 string: object
901 pos: Py_ssize_t = 0
902 endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
903
904[clinic start generated code]*/
905
906static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100907_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
908 PyObject *string, Py_ssize_t pos,
909 Py_ssize_t endpos)
910/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300911{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +0100912 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
913
914 return pattern_scanner(module_state, self, string, pos, endpos);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300915}
916
917/*[clinic input]
918_sre.SRE_Pattern.split
919
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200920 string: object
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300921 maxsplit: Py_ssize_t = 0
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300922
923Split string by the occurrences of pattern.
924[clinic start generated code]*/
925
926static PyObject *
927_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
Serhiy Storchakab37f3f62017-01-13 08:53:58 +0200928 Py_ssize_t maxsplit)
929/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000930{
931 SRE_STATE state;
932 PyObject* list;
933 PyObject* item;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +0100934 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000935 Py_ssize_t n;
936 Py_ssize_t i;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300937 const void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000938
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200939 assert(self->codesize != 0);
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200940
Serhiy Storchakaa860aea2015-05-03 15:54:23 +0300941 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000942 return NULL;
943
944 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +0000945 if (!list) {
946 state_fini(&state);
947 return NULL;
948 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000949
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000950 n = 0;
951 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000952
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000953 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000954
955 state_reset(&state);
956
957 state.ptr = state.start;
958
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300959 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300960 if (PyErr_Occurred())
961 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +0000962
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000963 if (status <= 0) {
964 if (status == 0)
965 break;
966 pattern_error(status);
967 goto error;
968 }
Tim Peters3d563502006-01-21 02:47:53 +0000969
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000970 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300971 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000972 string, STATE_OFFSET(&state, last),
973 STATE_OFFSET(&state, state.start)
974 );
975 if (!item)
976 goto error;
977 status = PyList_Append(list, item);
978 Py_DECREF(item);
979 if (status < 0)
980 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000981
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000982 /* add groups (if any) */
983 for (i = 0; i < self->groups; i++) {
984 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000985 if (!item)
986 goto error;
987 status = PyList_Append(list, item);
988 Py_DECREF(item);
989 if (status < 0)
990 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000991 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000992
993 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +0200994 state.must_advance = (state.ptr == state.start);
Fredrik Lundhbec95b92001-10-21 16:47:57 +0000995 last = state.start = state.ptr;
996
Fredrik Lundh971e78b2001-10-20 17:48:46 +0000997 }
998
Fredrik Lundhf864aa82001-10-22 06:01:56 +0000999 /* get segment following last match (even if empty) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001000 item = getslice(state.isbytes, state.beginning,
Fredrik Lundhf864aa82001-10-22 06:01:56 +00001001 string, STATE_OFFSET(&state, last), state.endpos
1002 );
1003 if (!item)
1004 goto error;
1005 status = PyList_Append(list, item);
1006 Py_DECREF(item);
1007 if (status < 0)
1008 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001009
1010 state_fini(&state);
1011 return list;
1012
1013error:
1014 Py_DECREF(list);
1015 state_fini(&state);
1016 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001017
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001018}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001019
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001020static PyObject*
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001021pattern_subx(_sremodulestate* module_state,
1022 PatternObject* self,
1023 PyObject* ptemplate,
1024 PyObject* string,
1025 Py_ssize_t count,
1026 Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001027{
1028 SRE_STATE state;
1029 PyObject* list;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001030 PyObject* joiner;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001031 PyObject* item;
1032 PyObject* filter;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001033 PyObject* match;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001034 const void* ptr;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01001035 Py_ssize_t status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001036 Py_ssize_t n;
1037 Py_ssize_t i, b, e;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001038 int isbytes, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001039 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001040 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001042 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001043 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001044 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001045 Py_INCREF(filter);
1046 filter_is_callable = 1;
1047 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001048 /* if not callable, check if it's a literal string */
1049 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001050 view.buf = NULL;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001051 ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001052 if (ptr) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001053 if (charsize == 1)
1054 literal = memchr(ptr, '\\', n) == NULL;
1055 else
1056 literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001057 } else {
1058 PyErr_Clear();
1059 literal = 0;
1060 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06001061 if (view.buf)
1062 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001063 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001064 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001065 Py_INCREF(filter);
1066 filter_is_callable = 0;
1067 } else {
1068 /* not a literal; hand it over to the template compiler */
1069 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00001070 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001071 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001072 );
1073 if (!filter)
1074 return NULL;
1075 filter_is_callable = PyCallable_Check(filter);
1076 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00001077 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001078
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001079 if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001080 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001081 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00001082 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001083
1084 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001085 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00001086 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00001087 state_fini(&state);
1088 return NULL;
1089 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001090
1091 n = i = 0;
1092
1093 while (!count || n < count) {
1094
1095 state_reset(&state);
1096
1097 state.ptr = state.start;
1098
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001099 status = sre_search(&state, PatternObject_GetCode(self));
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001100 if (PyErr_Occurred())
1101 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001102
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001103 if (status <= 0) {
1104 if (status == 0)
1105 break;
1106 pattern_error(status);
1107 goto error;
1108 }
Tim Peters3d563502006-01-21 02:47:53 +00001109
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001110 b = STATE_OFFSET(&state, state.start);
1111 e = STATE_OFFSET(&state, state.ptr);
1112
1113 if (i < b) {
1114 /* get segment before this match */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001115 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001116 string, i, b);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001117 if (!item)
1118 goto error;
1119 status = PyList_Append(list, item);
1120 Py_DECREF(item);
1121 if (status < 0)
1122 goto error;
1123
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001124 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001125
1126 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00001127 /* pass match object through filter */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001128 match = pattern_new_match(module_state, self, &state, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001129 if (!match)
1130 goto error;
Petr Viktorinffd97532020-02-11 17:46:57 +01001131 item = PyObject_CallOneArg(filter, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001132 Py_DECREF(match);
1133 if (!item)
1134 goto error;
1135 } else {
1136 /* filter is literal string */
1137 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001138 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001139 }
1140
1141 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001142 if (item != Py_None) {
1143 status = PyList_Append(list, item);
1144 Py_DECREF(item);
1145 if (status < 0)
1146 goto error;
1147 }
Tim Peters3d563502006-01-21 02:47:53 +00001148
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001149 i = e;
1150 n = n + 1;
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001151 state.must_advance = (state.ptr == state.start);
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001152 state.start = state.ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001153 }
1154
1155 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00001156 if (i < state.endpos) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001157 item = getslice(state.isbytes, state.beginning,
Serhiy Storchaka25324972013-10-16 12:46:28 +03001158 string, i, state.endpos);
Fredrik Lundhdac58492001-10-21 21:48:30 +00001159 if (!item)
1160 goto error;
1161 status = PyList_Append(list, item);
1162 Py_DECREF(item);
1163 if (status < 0)
1164 goto error;
1165 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001166
1167 state_fini(&state);
1168
Guido van Rossum4e173842001-12-07 04:25:10 +00001169 Py_DECREF(filter);
1170
Fredrik Lundhdac58492001-10-21 21:48:30 +00001171 /* convert list to single string (also removes list) */
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001172 joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001173 if (!joiner) {
1174 Py_DECREF(list);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001175 return NULL;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001176 }
1177 if (PyList_GET_SIZE(list) == 0) {
1178 Py_DECREF(list);
1179 item = joiner;
1180 }
1181 else {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001182 if (state.isbytes)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001183 item = _PyBytes_Join(joiner, list);
1184 else
1185 item = PyUnicode_Join(joiner, list);
1186 Py_DECREF(joiner);
Brett Cannonbaced562013-10-18 14:03:16 -04001187 Py_DECREF(list);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001188 if (!item)
1189 return NULL;
1190 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001191
1192 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001193 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001194
1195 return item;
1196
1197error:
1198 Py_DECREF(list);
1199 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00001200 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001201 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00001202
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001203}
1204
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001205/*[clinic input]
1206_sre.SRE_Pattern.sub
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001207
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001208 cls: defining_class
1209 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001210 repl: object
1211 string: object
1212 count: Py_ssize_t = 0
1213
1214Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1215[clinic start generated code]*/
1216
1217static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001218_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1219 PyObject *repl, PyObject *string, Py_ssize_t count)
1220/*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001221{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001222 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1223
1224 return pattern_subx(module_state, self, repl, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001225}
1226
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001227/*[clinic input]
1228_sre.SRE_Pattern.subn
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001229
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001230 cls: defining_class
1231 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001232 repl: object
1233 string: object
1234 count: Py_ssize_t = 0
1235
1236Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1237[clinic start generated code]*/
1238
1239static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001240_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1241 PyObject *repl, PyObject *string,
1242 Py_ssize_t count)
1243/*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001244{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001245 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1246
1247 return pattern_subx(module_state, self, repl, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001248}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001249
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001250/*[clinic input]
1251_sre.SRE_Pattern.__copy__
1252
1253[clinic start generated code]*/
1254
1255static PyObject *
1256_sre_SRE_Pattern___copy___impl(PatternObject *self)
1257/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001258{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001259 Py_INCREF(self);
1260 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001261}
1262
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001263/*[clinic input]
1264_sre.SRE_Pattern.__deepcopy__
1265
1266 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001267 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001268
1269[clinic start generated code]*/
1270
1271static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001272_sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
1273/*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001274{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001275 Py_INCREF(self);
1276 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00001277}
1278
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001279static PyObject *
1280pattern_repr(PatternObject *obj)
1281{
1282 static const struct {
1283 const char *name;
1284 int value;
1285 } flag_names[] = {
1286 {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
1287 {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1288 {"re.LOCALE", SRE_FLAG_LOCALE},
1289 {"re.MULTILINE", SRE_FLAG_MULTILINE},
1290 {"re.DOTALL", SRE_FLAG_DOTALL},
1291 {"re.UNICODE", SRE_FLAG_UNICODE},
1292 {"re.VERBOSE", SRE_FLAG_VERBOSE},
1293 {"re.DEBUG", SRE_FLAG_DEBUG},
1294 {"re.ASCII", SRE_FLAG_ASCII},
1295 };
1296 PyObject *result = NULL;
1297 PyObject *flag_items;
Victor Stinner706768c2014-08-16 01:03:39 +02001298 size_t i;
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001299 int flags = obj->flags;
1300
1301 /* Omit re.UNICODE for valid string patterns. */
1302 if (obj->isbytes == 0 &&
1303 (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1304 SRE_FLAG_UNICODE)
1305 flags &= ~SRE_FLAG_UNICODE;
1306
1307 flag_items = PyList_New(0);
1308 if (!flag_items)
1309 return NULL;
1310
1311 for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1312 if (flags & flag_names[i].value) {
1313 PyObject *item = PyUnicode_FromString(flag_names[i].name);
1314 if (!item)
1315 goto done;
1316
1317 if (PyList_Append(flag_items, item) < 0) {
1318 Py_DECREF(item);
1319 goto done;
1320 }
1321 Py_DECREF(item);
1322 flags &= ~flag_names[i].value;
1323 }
1324 }
1325 if (flags) {
1326 PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1327 if (!item)
1328 goto done;
1329
1330 if (PyList_Append(flag_items, item) < 0) {
1331 Py_DECREF(item);
1332 goto done;
1333 }
1334 Py_DECREF(item);
1335 }
1336
1337 if (PyList_Size(flag_items) > 0) {
1338 PyObject *flags_result;
1339 PyObject *sep = PyUnicode_FromString("|");
1340 if (!sep)
1341 goto done;
1342 flags_result = PyUnicode_Join(sep, flag_items);
1343 Py_DECREF(sep);
1344 if (!flags_result)
1345 goto done;
1346 result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1347 obj->pattern, flags_result);
1348 Py_DECREF(flags_result);
1349 }
1350 else {
1351 result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1352 }
1353
1354done:
1355 Py_DECREF(flag_items);
1356 return result;
1357}
1358
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001359PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
Raymond Hettinger94478742004-09-24 04:31:19 +00001360
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001361/* PatternObject's 'groupindex' method. */
1362static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02001363pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001364{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001365 if (self->groupindex == NULL)
1366 return PyDict_New();
Serhiy Storchaka07360df2015-03-30 01:01:48 +03001367 return PyDictProxy_New(self->groupindex);
1368}
1369
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001370static int _validate(PatternObject *self); /* Forward */
1371
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001372/*[clinic input]
1373_sre.compile
1374
1375 pattern: object
1376 flags: int
1377 code: object(subclass_of='&PyList_Type')
1378 groups: Py_ssize_t
Victor Stinner726a57d2016-11-22 23:04:39 +01001379 groupindex: object(subclass_of='&PyDict_Type')
1380 indexgroup: object(subclass_of='&PyTuple_Type')
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001381
1382[clinic start generated code]*/
1383
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001384static PyObject *
Serhiy Storchaka1a2b24f2016-07-07 17:35:15 +03001385_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03001386 PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1387 PyObject *indexgroup)
Victor Stinner726a57d2016-11-22 23:04:39 +01001388/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001389{
1390 /* "compile" pattern descriptor to pattern object */
1391
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001392 _sremodulestate *module_state = get_sre_module_state(module);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001393 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001394 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001395
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001396 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00001397 /* coverity[ampersand_in_size] */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001398 self = PyObject_NewVar(PatternObject, module_state->Pattern_Type, n);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001399 if (!self)
1400 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001401 self->weakreflist = NULL;
1402 self->pattern = NULL;
1403 self->groupindex = NULL;
1404 self->indexgroup = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001405
1406 self->codesize = n;
1407
1408 for (i = 0; i < n; i++) {
1409 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00001410 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001411 self->code[i] = (SRE_CODE) value;
1412 if ((unsigned long) self->code[i] != value) {
1413 PyErr_SetString(PyExc_OverflowError,
1414 "regular expression code size limit exceeded");
1415 break;
1416 }
1417 }
1418
1419 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001420 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001421 return NULL;
1422 }
1423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 if (pattern == Py_None) {
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001425 self->isbytes = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01001426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 else {
1428 Py_ssize_t p_length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001429 int charsize;
1430 Py_buffer view;
1431 view.buf = NULL;
1432 if (!getstring(pattern, &p_length, &self->isbytes,
1433 &charsize, &view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 Py_DECREF(self);
1435 return NULL;
1436 }
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001437 if (view.buf)
1438 PyBuffer_Release(&view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001440
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001441 Py_INCREF(pattern);
1442 self->pattern = pattern;
1443
1444 self->flags = flags;
1445
1446 self->groups = groups;
1447
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03001448 if (PyDict_GET_SIZE(groupindex) > 0) {
1449 Py_INCREF(groupindex);
1450 self->groupindex = groupindex;
1451 if (PyTuple_GET_SIZE(indexgroup) > 0) {
1452 Py_INCREF(indexgroup);
1453 self->indexgroup = indexgroup;
1454 }
1455 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001456
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001457 if (!_validate(self)) {
1458 Py_DECREF(self);
1459 return NULL;
1460 }
1461
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001462 return (PyObject*) self;
1463}
1464
Guido van Rossumb700df92000-03-31 14:59:30 +00001465/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001466/* Code validation */
1467
1468/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
1470 for conformance with the code patterns generated there.
1471
1472 The nice thing about the generated code is that it is position-independent:
1473 all jumps are relative jumps forward. Also, jumps don't cross each other:
1474 the target of a later jump is always earlier than the target of an earlier
1475 jump. IOW, this is okay:
1476
1477 J---------J-------T--------T
1478 \ \_____/ /
1479 \______________________/
1480
1481 but this is not:
1482
1483 J---------J-------T--------T
1484 \_________\_____/ /
1485 \____________/
1486
Serhiy Storchakaefa5a392013-10-27 08:04:58 +02001487 It also helps that SRE_CODE is always an unsigned type.
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001488*/
1489
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete,
   parenthesized printf argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns 0 from the *enclosing* function */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros expect the enclosing function to declare 'code' and
   'end' (current position and end of the code array) and whichever of
   'op', 'arg' and 'skip' the macro assigns.  Each one FAILs when
   'code' has already reached 'end'. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL when 'skip - adj' exceeds the number of
   code words left, counted from the skip word itself.  'adj' is
   parenthesized so that an expression argument keeps its meaning. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001530
1531static int
1532_validate_charset(SRE_CODE *code, SRE_CODE *end)
1533{
1534 /* Some variables are manipulated by the macros above */
1535 SRE_CODE op;
1536 SRE_CODE arg;
1537 SRE_CODE offset;
1538 int i;
1539
1540 while (code < end) {
1541 GET_OP;
1542 switch (op) {
1543
1544 case SRE_OP_NEGATE:
1545 break;
1546
1547 case SRE_OP_LITERAL:
1548 GET_ARG;
1549 break;
1550
1551 case SRE_OP_RANGE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001552 case SRE_OP_RANGE_UNI_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001553 GET_ARG;
1554 GET_ARG;
1555 break;
1556
1557 case SRE_OP_CHARSET:
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001558 offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
Benjamin Petersonca470632016-09-06 13:47:26 -07001559 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001560 FAIL;
1561 code += offset;
1562 break;
1563
1564 case SRE_OP_BIGCHARSET:
1565 GET_ARG; /* Number of blocks */
1566 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001567 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001568 FAIL;
1569 /* Make sure that each byte points to a valid block */
1570 for (i = 0; i < 256; i++) {
1571 if (((unsigned char *)code)[i] >= arg)
1572 FAIL;
1573 }
1574 code += offset;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001575 offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
Benjamin Petersonca470632016-09-06 13:47:26 -07001576 if (offset > (uintptr_t)(end - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001577 FAIL;
1578 code += offset;
1579 break;
1580
1581 case SRE_OP_CATEGORY:
1582 GET_ARG;
1583 switch (arg) {
1584 case SRE_CATEGORY_DIGIT:
1585 case SRE_CATEGORY_NOT_DIGIT:
1586 case SRE_CATEGORY_SPACE:
1587 case SRE_CATEGORY_NOT_SPACE:
1588 case SRE_CATEGORY_WORD:
1589 case SRE_CATEGORY_NOT_WORD:
1590 case SRE_CATEGORY_LINEBREAK:
1591 case SRE_CATEGORY_NOT_LINEBREAK:
1592 case SRE_CATEGORY_LOC_WORD:
1593 case SRE_CATEGORY_LOC_NOT_WORD:
1594 case SRE_CATEGORY_UNI_DIGIT:
1595 case SRE_CATEGORY_UNI_NOT_DIGIT:
1596 case SRE_CATEGORY_UNI_SPACE:
1597 case SRE_CATEGORY_UNI_NOT_SPACE:
1598 case SRE_CATEGORY_UNI_WORD:
1599 case SRE_CATEGORY_UNI_NOT_WORD:
1600 case SRE_CATEGORY_UNI_LINEBREAK:
1601 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1602 break;
1603 default:
1604 FAIL;
1605 }
1606 break;
1607
1608 default:
1609 FAIL;
1610
1611 }
1612 }
1613
1614 return 1;
1615}
1616
1617static int
1618_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1619{
1620 /* Some variables are manipulated by the macros above */
1621 SRE_CODE op;
1622 SRE_CODE arg;
1623 SRE_CODE skip;
1624
1625 VTRACE(("code=%p, end=%p\n", code, end));
1626
1627 if (code > end)
1628 FAIL;
1629
1630 while (code < end) {
1631 GET_OP;
1632 switch (op) {
1633
1634 case SRE_OP_MARK:
1635 /* We don't check whether marks are properly nested; the
1636 sre_match() code is robust even if they don't, and the worst
1637 you can get is nonsensical match results. */
1638 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001639 if (arg > 2 * (size_t)groups + 1) {
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001640 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1641 FAIL;
1642 }
1643 break;
1644
1645 case SRE_OP_LITERAL:
1646 case SRE_OP_NOT_LITERAL:
1647 case SRE_OP_LITERAL_IGNORE:
1648 case SRE_OP_NOT_LITERAL_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001649 case SRE_OP_LITERAL_UNI_IGNORE:
1650 case SRE_OP_NOT_LITERAL_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001651 case SRE_OP_LITERAL_LOC_IGNORE:
1652 case SRE_OP_NOT_LITERAL_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001653 GET_ARG;
1654 /* The arg is just a character, nothing to check */
1655 break;
1656
1657 case SRE_OP_SUCCESS:
1658 case SRE_OP_FAILURE:
1659 /* Nothing to check; these normally end the matching process */
1660 break;
1661
1662 case SRE_OP_AT:
1663 GET_ARG;
1664 switch (arg) {
1665 case SRE_AT_BEGINNING:
1666 case SRE_AT_BEGINNING_STRING:
1667 case SRE_AT_BEGINNING_LINE:
1668 case SRE_AT_END:
1669 case SRE_AT_END_LINE:
1670 case SRE_AT_END_STRING:
1671 case SRE_AT_BOUNDARY:
1672 case SRE_AT_NON_BOUNDARY:
1673 case SRE_AT_LOC_BOUNDARY:
1674 case SRE_AT_LOC_NON_BOUNDARY:
1675 case SRE_AT_UNI_BOUNDARY:
1676 case SRE_AT_UNI_NON_BOUNDARY:
1677 break;
1678 default:
1679 FAIL;
1680 }
1681 break;
1682
1683 case SRE_OP_ANY:
1684 case SRE_OP_ANY_ALL:
1685 /* These have no operands */
1686 break;
1687
1688 case SRE_OP_IN:
1689 case SRE_OP_IN_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001690 case SRE_OP_IN_UNI_IGNORE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001691 case SRE_OP_IN_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001692 GET_SKIP;
1693 /* Stop 1 before the end; we check the FAILURE below */
1694 if (!_validate_charset(code, code+skip-2))
1695 FAIL;
1696 if (code[skip-2] != SRE_OP_FAILURE)
1697 FAIL;
1698 code += skip-1;
1699 break;
1700
1701 case SRE_OP_INFO:
1702 {
1703 /* A minimal info field is
1704 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
1705 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
1706 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02001707 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001708 SRE_CODE *newcode;
1709 GET_SKIP;
1710 newcode = code+skip-1;
1711 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001712 GET_ARG;
1713 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001714 /* Check that only valid flags are present */
1715 if ((flags & ~(SRE_INFO_PREFIX |
1716 SRE_INFO_LITERAL |
1717 SRE_INFO_CHARSET)) != 0)
1718 FAIL;
1719 /* PREFIX and CHARSET are mutually exclusive */
1720 if ((flags & SRE_INFO_PREFIX) &&
1721 (flags & SRE_INFO_CHARSET))
1722 FAIL;
1723 /* LITERAL implies PREFIX */
1724 if ((flags & SRE_INFO_LITERAL) &&
1725 !(flags & SRE_INFO_PREFIX))
1726 FAIL;
1727 /* Validate the prefix */
1728 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02001729 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001730 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02001731 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001732 /* Here comes the prefix string */
Benjamin Petersonca470632016-09-06 13:47:26 -07001733 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001734 FAIL;
1735 code += prefix_len;
1736 /* And here comes the overlap table */
Benjamin Petersonca470632016-09-06 13:47:26 -07001737 if (prefix_len > (uintptr_t)(newcode - code))
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001738 FAIL;
1739 /* Each overlap value should be < prefix_len */
1740 for (i = 0; i < prefix_len; i++) {
1741 if (code[i] >= prefix_len)
1742 FAIL;
1743 }
1744 code += prefix_len;
1745 }
1746 /* Validate the charset */
1747 if (flags & SRE_INFO_CHARSET) {
1748 if (!_validate_charset(code, newcode-1))
1749 FAIL;
1750 if (newcode[-1] != SRE_OP_FAILURE)
1751 FAIL;
1752 code = newcode;
1753 }
1754 else if (code != newcode) {
1755 VTRACE(("code=%p, newcode=%p\n", code, newcode));
1756 FAIL;
1757 }
1758 }
1759 break;
1760
1761 case SRE_OP_BRANCH:
1762 {
1763 SRE_CODE *target = NULL;
1764 for (;;) {
1765 GET_SKIP;
1766 if (skip == 0)
1767 break;
1768 /* Stop 2 before the end; we check the JUMP below */
1769 if (!_validate_inner(code, code+skip-3, groups))
1770 FAIL;
1771 code += skip-3;
1772 /* Check that it ends with a JUMP, and that each JUMP
1773 has the same target */
1774 GET_OP;
1775 if (op != SRE_OP_JUMP)
1776 FAIL;
1777 GET_SKIP;
1778 if (target == NULL)
1779 target = code+skip-1;
1780 else if (code+skip-1 != target)
1781 FAIL;
1782 }
1783 }
1784 break;
1785
1786 case SRE_OP_REPEAT_ONE:
1787 case SRE_OP_MIN_REPEAT_ONE:
1788 {
1789 SRE_CODE min, max;
1790 GET_SKIP;
1791 GET_ARG; min = arg;
1792 GET_ARG; max = arg;
1793 if (min > max)
1794 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001795 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001796 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001797 if (!_validate_inner(code, code+skip-4, groups))
1798 FAIL;
1799 code += skip-4;
1800 GET_OP;
1801 if (op != SRE_OP_SUCCESS)
1802 FAIL;
1803 }
1804 break;
1805
1806 case SRE_OP_REPEAT:
1807 {
1808 SRE_CODE min, max;
1809 GET_SKIP;
1810 GET_ARG; min = arg;
1811 GET_ARG; max = arg;
1812 if (min > max)
1813 FAIL;
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001814 if (max > SRE_MAXREPEAT)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001815 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001816 if (!_validate_inner(code, code+skip-3, groups))
1817 FAIL;
1818 code += skip-3;
1819 GET_OP;
1820 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
1821 FAIL;
1822 }
1823 break;
1824
1825 case SRE_OP_GROUPREF:
1826 case SRE_OP_GROUPREF_IGNORE:
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001827 case SRE_OP_GROUPREF_UNI_IGNORE:
1828 case SRE_OP_GROUPREF_LOC_IGNORE:
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001829 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001830 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001831 FAIL;
1832 break;
1833
1834 case SRE_OP_GROUPREF_EXISTS:
1835 /* The regex syntax for this is: '(?(group)then|else)', where
1836 'group' is either an integer group number or a group name,
1837 'then' and 'else' are sub-regexes, and 'else' is optional. */
1838 GET_ARG;
Victor Stinner1fa174a2013-08-28 02:06:21 +02001839 if (arg >= (size_t)groups)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001840 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00001841 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001842 code--; /* The skip is relative to the first arg! */
1843 /* There are two possibilities here: if there is both a 'then'
1844 part and an 'else' part, the generated code looks like:
1845
1846 GROUPREF_EXISTS
1847 <group>
1848 <skipyes>
1849 ...then part...
1850 JUMP
1851 <skipno>
1852 (<skipyes> jumps here)
1853 ...else part...
1854 (<skipno> jumps here)
1855
1856 If there is only a 'then' part, it looks like:
1857
1858 GROUPREF_EXISTS
1859 <group>
1860 <skip>
1861 ...then part...
1862 (<skip> jumps here)
1863
1864 There is no direct way to decide which it is, and we don't want
1865 to allow arbitrary jumps anywhere in the code; so we just look
1866 for a JUMP opcode preceding our skip target.
1867 */
Benjamin Petersonca470632016-09-06 13:47:26 -07001868 if (skip >= 3 && skip-3 < (uintptr_t)(end - code) &&
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001869 code[skip-3] == SRE_OP_JUMP)
1870 {
1871 VTRACE(("both then and else parts present\n"));
1872 if (!_validate_inner(code+1, code+skip-3, groups))
1873 FAIL;
1874 code += skip-2; /* Position after JUMP, at <skipno> */
1875 GET_SKIP;
1876 if (!_validate_inner(code, code+skip-1, groups))
1877 FAIL;
1878 code += skip-1;
1879 }
1880 else {
1881 VTRACE(("only a then part present\n"));
1882 if (!_validate_inner(code+1, code+skip-1, groups))
1883 FAIL;
1884 code += skip-1;
1885 }
1886 break;
1887
1888 case SRE_OP_ASSERT:
1889 case SRE_OP_ASSERT_NOT:
1890 GET_SKIP;
1891 GET_ARG; /* 0 for lookahead, width for lookbehind */
1892 code--; /* Back up over arg to simplify math below */
1893 if (arg & 0x80000000)
1894 FAIL; /* Width too large */
1895 /* Stop 1 before the end; we check the SUCCESS below */
1896 if (!_validate_inner(code+1, code+skip-2, groups))
1897 FAIL;
1898 code += skip-2;
1899 GET_OP;
1900 if (op != SRE_OP_SUCCESS)
1901 FAIL;
1902 break;
1903
1904 default:
1905 FAIL;
1906
1907 }
1908 }
1909
1910 VTRACE(("okay\n"));
1911 return 1;
1912}
1913
1914static int
1915_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1916{
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001917 if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1918 code >= end || end[-1] != SRE_OP_SUCCESS)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001919 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00001920 return _validate_inner(code, end-1, groups);
1921}
1922
1923static int
1924_validate(PatternObject *self)
1925{
1926 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
1927 {
1928 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
1929 return 0;
1930 }
1931 else
1932 VTRACE(("Success!\n"));
1933 return 1;
1934}
1935
1936/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00001937/* match methods */
1938
1939static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001940match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001941{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001942 PyTypeObject *tp = Py_TYPE(self);
1943
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001944 Py_XDECREF(self->regs);
1945 Py_XDECREF(self->string);
1946 Py_DECREF(self->pattern);
1947 PyObject_DEL(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01001948 Py_DECREF(tp);
Guido van Rossumb700df92000-03-31 14:59:30 +00001949}
1950
1951static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001952match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00001953{
Serhiy Storchaka25324972013-10-16 12:46:28 +03001954 Py_ssize_t length;
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001955 int isbytes, charsize;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001956 Py_buffer view;
1957 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001958 const void* ptr;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001959 Py_ssize_t i, j;
Serhiy Storchaka25324972013-10-16 12:46:28 +03001960
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001961 assert(0 <= index && index < self->groups);
Fredrik Lundh6f013982000-07-03 18:44:21 +00001962 index *= 2;
1963
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001964 if (self->string == Py_None || self->mark[index] < 0) {
1965 /* return default value if the string or group is undefined */
1966 Py_INCREF(def);
1967 return def;
1968 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001969
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001970 ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
Serhiy Storchaka25324972013-10-16 12:46:28 +03001971 if (ptr == NULL)
1972 return NULL;
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02001973
1974 i = self->mark[index];
1975 j = self->mark[index+1];
1976 i = Py_MIN(i, length);
1977 j = Py_MIN(j, length);
1978 result = getslice(isbytes, ptr, self->string, i, j);
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03001979 if (isbytes && view.buf != NULL)
Serhiy Storchaka25324972013-10-16 12:46:28 +03001980 PyBuffer_Release(&view);
1981 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00001982}
1983
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001984static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001985match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00001986{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001987 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00001988
Guido van Rossumddefaf32007-01-14 03:31:43 +00001989 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001990 /* Default value */
1991 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00001992
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001993 if (PyIndex_Check(index)) {
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001994 i = PyNumber_AsSsize_t(index, NULL);
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +03001995 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001996 else {
1997 i = -1;
Guido van Rossumb700df92000-03-31 14:59:30 +00001998
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02001999 if (self->pattern->groupindex) {
2000 index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2001 if (index && PyLong_Check(index)) {
2002 i = PyLong_AsSsize_t(index);
2003 }
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002004 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002005 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002006 if (i < 0 || i >= self->groups) {
2007 /* raise IndexError if we were given a bad group number */
2008 if (!PyErr_Occurred()) {
2009 PyErr_SetString(PyExc_IndexError, "no such group");
2010 }
2011 return -1;
2012 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00002013
2014 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002015}
2016
2017static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002018match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002019{
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002020 Py_ssize_t i = match_getindex(self, index);
2021
2022 if (i < 0) {
2023 return NULL;
2024 }
2025
2026 return match_getslice_by_index(self, i, def);
Guido van Rossumb700df92000-03-31 14:59:30 +00002027}
2028
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002029/*[clinic input]
2030_sre.SRE_Match.expand
2031
2032 template: object
2033
2034Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2035[clinic start generated code]*/
2036
2037static PyObject *
2038_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2039/*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002040{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002041 /* delegate to Python code */
2042 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002043 SRE_PY_MODULE, "_expand",
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002044 PyTuple_Pack(3, self->pattern, self, template)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00002045 );
2046}
2047
2048static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002049match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00002050{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002051 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002052 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00002055
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002056 switch (size) {
2057 case 0:
Victor Stinner37834132020-10-27 17:12:53 +01002058 result = match_getslice(self, _PyLong_GetZero(), Py_None);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002059 break;
2060 case 1:
2061 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2062 break;
2063 default:
2064 /* fetch multiple items */
2065 result = PyTuple_New(size);
2066 if (!result)
2067 return NULL;
2068 for (i = 0; i < size; i++) {
2069 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00002070 self, PyTuple_GET_ITEM(args, i), Py_None
2071 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002072 if (!item) {
2073 Py_DECREF(result);
2074 return NULL;
2075 }
2076 PyTuple_SET_ITEM(result, i, item);
2077 }
2078 break;
2079 }
2080 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002081}
2082
Eric V. Smith605bdae2016-09-11 08:55:43 -04002083static PyObject*
2084match_getitem(MatchObject* self, PyObject* name)
2085{
2086 return match_getslice(self, name, Py_None);
2087}
2088
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002089/*[clinic input]
2090_sre.SRE_Match.groups
2091
2092 default: object = None
2093 Is used for groups that did not participate in the match.
2094
2095Return a tuple containing all the subgroups of the match, from 1.
2096[clinic start generated code]*/
2097
2098static PyObject *
2099_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2100/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002101{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002102 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002103 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00002104
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002105 result = PyTuple_New(self->groups-1);
2106 if (!result)
2107 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002108
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002109 for (index = 1; index < self->groups; index++) {
2110 PyObject* item;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002111 item = match_getslice_by_index(self, index, default_value);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002112 if (!item) {
2113 Py_DECREF(result);
2114 return NULL;
2115 }
2116 PyTuple_SET_ITEM(result, index-1, item);
2117 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002118
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002119 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002120}
2121
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002122/*[clinic input]
2123_sre.SRE_Match.groupdict
2124
2125 default: object = None
2126 Is used for groups that did not participate in the match.
2127
2128Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2129[clinic start generated code]*/
2130
2131static PyObject *
2132_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2133/*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002134{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002135 PyObject *result;
2136 PyObject *key;
2137 PyObject *value;
2138 Py_ssize_t pos = 0;
2139 Py_hash_t hash;
Guido van Rossumb700df92000-03-31 14:59:30 +00002140
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002141 result = PyDict_New();
2142 if (!result || !self->pattern->groupindex)
2143 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00002144
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002145 while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00002146 int status;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002147 Py_INCREF(key);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002148 value = match_getslice(self, key, default_value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002149 if (!value) {
2150 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002151 goto failed;
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002152 }
2153 status = _PyDict_SetItem_KnownHash(result, key, value, hash);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002154 Py_DECREF(value);
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002155 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00002156 if (status < 0)
2157 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002158 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002159
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002160 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00002161
2162failed:
Fredrik Lundh770617b2001-01-14 15:06:11 +00002163 Py_DECREF(result);
2164 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002165}
2166
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002167/*[clinic input]
2168_sre.SRE_Match.start -> Py_ssize_t
2169
2170 group: object(c_default="NULL") = 0
2171 /
2172
2173Return index of the start of the substring matched by group.
2174[clinic start generated code]*/
2175
2176static Py_ssize_t
2177_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2178/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002179{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002180 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002181
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002182 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002183 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002184 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002185
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002186 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002187 return self->mark[index*2];
Guido van Rossumb700df92000-03-31 14:59:30 +00002188}
2189
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002190/*[clinic input]
2191_sre.SRE_Match.end -> Py_ssize_t
2192
2193 group: object(c_default="NULL") = 0
2194 /
2195
2196Return index of the end of the substring matched by group.
2197[clinic start generated code]*/
2198
2199static Py_ssize_t
2200_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2201/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002202{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002203 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002204
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002205 if (index < 0) {
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002206 return -1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002207 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002208
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002209 /* mark is -1 if group is undefined */
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002210 return self->mark[index*2+1];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002211}
2212
2213LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002214_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002215{
2216 PyObject* pair;
2217 PyObject* item;
2218
2219 pair = PyTuple_New(2);
2220 if (!pair)
2221 return NULL;
2222
Christian Heimes217cfd12007-12-02 14:31:20 +00002223 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002224 if (!item)
2225 goto error;
2226 PyTuple_SET_ITEM(pair, 0, item);
2227
Christian Heimes217cfd12007-12-02 14:31:20 +00002228 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002229 if (!item)
2230 goto error;
2231 PyTuple_SET_ITEM(pair, 1, item);
2232
2233 return pair;
2234
2235 error:
2236 Py_DECREF(pair);
2237 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002238}
2239
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002240/*[clinic input]
2241_sre.SRE_Match.span
2242
2243 group: object(c_default="NULL") = 0
2244 /
2245
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002246For match object m, return the 2-tuple (m.start(group), m.end(group)).
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002247[clinic start generated code]*/
2248
2249static PyObject *
2250_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03002251/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
Guido van Rossumb700df92000-03-31 14:59:30 +00002252{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002253 Py_ssize_t index = match_getindex(self, group);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002254
Serhiy Storchakaa24107b2019-02-25 17:59:46 +02002255 if (index < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002256 return NULL;
2257 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002258
Fredrik Lundh510c97b2000-09-02 16:36:57 +00002259 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002260 return _pair(self->mark[index*2], self->mark[index*2+1]);
2261}
2262
2263static PyObject*
2264match_regs(MatchObject* self)
2265{
2266 PyObject* regs;
2267 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002268 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002269
2270 regs = PyTuple_New(self->groups);
2271 if (!regs)
2272 return NULL;
2273
2274 for (index = 0; index < self->groups; index++) {
2275 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2276 if (!item) {
2277 Py_DECREF(regs);
2278 return NULL;
2279 }
2280 PyTuple_SET_ITEM(regs, index, item);
2281 }
2282
2283 Py_INCREF(regs);
2284 self->regs = regs;
2285
2286 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00002287}
2288
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002289/*[clinic input]
2290_sre.SRE_Match.__copy__
2291
2292[clinic start generated code]*/
2293
2294static PyObject *
2295_sre_SRE_Match___copy___impl(MatchObject *self)
2296/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002297{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002298 Py_INCREF(self);
2299 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002300}
2301
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002302/*[clinic input]
2303_sre.SRE_Match.__deepcopy__
2304
2305 memo: object
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002306 /
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002307
2308[clinic start generated code]*/
2309
2310static PyObject *
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002311_sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
2312/*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002313{
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03002314 Py_INCREF(self);
2315 return (PyObject *)self;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002316}
2317
/* Docstring of the match type; installed through the Py_tp_doc slot
   of match_slots. */
PyDoc_STRVAR(match_doc,
"The result of re.match() and re.search().\n\
Match objects always have a boolean value of True.");
2321
/* Docstring of Match.group(); referenced from the "group" entry of
   match_methods.
   NOTE(review): the whitespace that starts each continuation line is part
   of the string literal -- confirm it against the canonical file before
   merging. */
PyDoc_STRVAR(match_group_doc,
"group([group1, ...]) -> str or tuple.\n\
 Return subgroup(s) of the match by indices or names.\n\
 For 0 returns the entire match.");
2326
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002327static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002328match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
Guido van Rossumb700df92000-03-31 14:59:30 +00002329{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002330 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002331 return PyLong_FromSsize_t(self->lastindex);
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002332 Py_RETURN_NONE;
Guido van Rossumb700df92000-03-31 14:59:30 +00002333}
2334
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002335static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002336match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002337{
Serhiy Storchakacd85d0b2017-04-16 09:39:30 +03002338 if (self->pattern->indexgroup &&
2339 self->lastindex >= 0 &&
2340 self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
2341 {
2342 PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
2343 self->lastindex);
2344 Py_INCREF(result);
2345 return result;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002346 }
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002347 Py_RETURN_NONE;
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002348}
2349
2350static PyObject *
Serhiy Storchakad4f9cf52018-11-27 19:34:35 +02002351match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002352{
2353 if (self->regs) {
2354 Py_INCREF(self->regs);
2355 return self->regs;
2356 } else
2357 return match_regs(self);
2358}
2359
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002360static PyObject *
2361match_repr(MatchObject *self)
2362{
2363 PyObject *result;
2364 PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
2365 if (group0 == NULL)
2366 return NULL;
2367 result = PyUnicode_FromFormat(
sth8b91eda2019-03-10 11:29:14 +01002368 "<%s object; span=(%zd, %zd), match=%.50R>",
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03002369 Py_TYPE(self)->tp_name,
2370 self->mark[0], self->mark[1], group0);
2371 Py_DECREF(group0);
2372 return result;
2373}
2374
2375
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002376static PyObject*
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002377pattern_new_match(_sremodulestate* module_state,
2378 PatternObject* pattern,
2379 SRE_STATE* state,
2380 Py_ssize_t status)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002381{
2382 /* create match object (from state object) */
2383
2384 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002385 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002386 char* base;
2387 int n;
2388
2389 if (status > 0) {
2390
2391 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00002392 /* coverity[ampersand_in_size] */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002393 match = PyObject_NewVar(MatchObject,
2394 module_state->Match_Type,
Victor Stinner92055202020-04-08 00:38:15 +02002395 2*(pattern->groups+1));
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002396 if (!match)
2397 return NULL;
2398
2399 Py_INCREF(pattern);
2400 match->pattern = pattern;
2401
2402 Py_INCREF(state->string);
2403 match->string = state->string;
2404
2405 match->regs = NULL;
2406 match->groups = pattern->groups+1;
2407
2408 /* fill in group slices */
2409
2410 base = (char*) state->beginning;
2411 n = state->charsize;
2412
2413 match->mark[0] = ((char*) state->start - base) / n;
2414 match->mark[1] = ((char*) state->ptr - base) / n;
2415
2416 for (i = j = 0; i < pattern->groups; i++, j+=2)
2417 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2418 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2419 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2420 } else
2421 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2422
2423 match->pos = state->pos;
2424 match->endpos = state->endpos;
2425
2426 match->lastindex = state->lastindex;
2427
2428 return (PyObject*) match;
2429
2430 } else if (status == 0) {
2431
2432 /* no match */
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02002433 Py_RETURN_NONE;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002434
2435 }
2436
2437 /* internal error */
2438 pattern_error(status);
2439 return NULL;
2440}
2441
2442
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002443/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002444/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002445
2446static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00002447scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002448{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002449 PyTypeObject *tp = Py_TYPE(self);
2450
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002451 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002452 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002453 PyObject_DEL(self);
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002454 Py_DECREF(tp);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002455}
2456
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002457/*[clinic input]
2458_sre.SRE_Scanner.match
2459
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002460 cls: defining_class
2461 /
2462
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002463[clinic start generated code]*/
2464
2465static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002466_sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2467/*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002468{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002469 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002470 SRE_STATE* state = &self->state;
2471 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002472 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002473
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002474 if (state->start == NULL)
2475 Py_RETURN_NONE;
2476
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002477 state_reset(state);
2478
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002479 state->ptr = state->start;
2480
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002481 status = sre_match(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002482 if (PyErr_Occurred())
2483 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002484
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002485 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2486 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002487
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002488 if (status == 0)
2489 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002490 else {
2491 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002492 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002493 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002494
2495 return match;
2496}
2497
2498
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002499/*[clinic input]
2500_sre.SRE_Scanner.search
2501
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002502 cls: defining_class
2503 /
2504
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002505[clinic start generated code]*/
2506
2507static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002508_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2509/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002510{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002511 _sremodulestate *module_state = get_sre_module_state_by_class(cls);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002512 SRE_STATE* state = &self->state;
2513 PyObject* match;
Victor Stinner7a6d7cf2012-10-31 00:37:41 +01002514 Py_ssize_t status;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002515
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002516 if (state->start == NULL)
2517 Py_RETURN_NONE;
2518
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00002519 state_reset(state);
2520
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002521 state->ptr = state->start;
2522
Serhiy Storchaka9eabac62013-10-26 10:45:48 +03002523 status = sre_search(state, PatternObject_GetCode(self->pattern));
Thomas Wouters89f507f2006-12-13 04:49:30 +00002524 if (PyErr_Occurred())
2525 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002526
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002527 match = pattern_new_match(module_state, (PatternObject*) self->pattern,
2528 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002529
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002530 if (status == 0)
2531 state->start = NULL;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002532 else {
2533 state->must_advance = (state->ptr == state->start);
Serhiy Storchaka03d6ee32015-07-06 13:58:33 +03002534 state->start = state->ptr;
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02002535 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002536
2537 return match;
2538}
2539
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002540static PyObject *
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002541pattern_scanner(_sremodulestate *module_state,
2542 PatternObject *self,
2543 PyObject *string,
2544 Py_ssize_t pos,
2545 Py_ssize_t endpos)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002546{
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002547 ScannerObject* scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002548
2549 /* create scanner object */
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002550 scanner = PyObject_New(ScannerObject, module_state->Scanner_Type);
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002551 if (!scanner)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002552 return NULL;
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002553 scanner->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002554
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002555 /* create search state object */
2556 if (!state_init(&scanner->state, self, string, pos, endpos)) {
2557 Py_DECREF(scanner);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002558 return NULL;
2559 }
2560
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002561 Py_INCREF(self);
2562 scanner->pattern = (PyObject*) self;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002563
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002564 return (PyObject*) scanner;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002565}
2566
Victor Stinnerb44fb122016-11-21 16:35:08 +01002567static Py_hash_t
2568pattern_hash(PatternObject *self)
2569{
2570 Py_hash_t hash, hash2;
2571
2572 hash = PyObject_Hash(self->pattern);
2573 if (hash == -1) {
2574 return -1;
2575 }
2576
2577 hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
2578 hash ^= hash2;
2579
2580 hash ^= self->flags;
2581 hash ^= self->isbytes;
2582 hash ^= self->codesize;
2583
2584 if (hash == -1) {
2585 hash = -2;
2586 }
2587 return hash;
2588}
2589
2590static PyObject*
2591pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
2592{
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002593 PyTypeObject *tp = Py_TYPE(lefto);
2594 _sremodulestate *module_state = get_sre_module_state_by_class(tp);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002595 PatternObject *left, *right;
2596 int cmp;
2597
2598 if (op != Py_EQ && op != Py_NE) {
2599 Py_RETURN_NOTIMPLEMENTED;
2600 }
2601
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002602 if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
2603 {
Victor Stinnerb44fb122016-11-21 16:35:08 +01002604 Py_RETURN_NOTIMPLEMENTED;
2605 }
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01002606
2607 if (lefto == righto) {
2608 /* a pattern is equal to itself */
2609 return PyBool_FromLong(op == Py_EQ);
2610 }
2611
Victor Stinnerb44fb122016-11-21 16:35:08 +01002612 left = (PatternObject *)lefto;
2613 right = (PatternObject *)righto;
2614
2615 cmp = (left->flags == right->flags
2616 && left->isbytes == right->isbytes
Victor Stinnere670b2d2016-11-22 15:23:00 +01002617 && left->codesize == right->codesize);
Victor Stinnerb44fb122016-11-21 16:35:08 +01002618 if (cmp) {
2619 /* Compare the code and the pattern because the same pattern can
2620 produce different codes depending on the locale used to compile the
2621 pattern when the re.LOCALE flag is used. Don't compare groups,
2622 indexgroup nor groupindex: they are derivated from the pattern. */
2623 cmp = (memcmp(left->code, right->code,
2624 sizeof(left->code[0]) * left->codesize) == 0);
2625 }
2626 if (cmp) {
2627 cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
2628 Py_EQ);
2629 if (cmp < 0) {
2630 return NULL;
2631 }
2632 }
2633 if (op == Py_NE) {
2634 cmp = !cmp;
2635 }
2636 return PyBool_FromLong(cmp);
2637}
2638
Serhiy Storchakaa860aea2015-05-03 15:54:23 +03002639#include "clinic/_sre.c.h"
2640
/* Method table of the pattern type (installed through Py_tp_methods in
   pattern_slots).  The _SRE_SRE_PATTERN_*_METHODDEF entries are macros,
   presumably supplied by the clinic header included just above; each
   expands to a complete table entry, hence no commas between them. */
static PyMethodDef pattern_methods[] = {
    _SRE_SRE_PATTERN_MATCH_METHODDEF
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
    _SRE_SRE_PATTERN_SUB_METHODDEF
    _SRE_SRE_PATTERN_SUBN_METHODDEF
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
    _SRE_SRE_PATTERN___COPY___METHODDEF
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
    /* class-level subscription (Pattern[...]) returns a generic alias */
    {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}  /* Sentinel */
};
2657
/* Computed attributes of the pattern type; "groupindex" has a getter
   only, so it cannot be assigned from Python. */
static PyGetSetDef pattern_getset[] = {
    {"groupindex", (getter)pattern_groupindex, (setter)NULL,
      "A dictionary mapping group names to group numbers."},
    {NULL}  /* Sentinel */
};
2663
/* Byte offset of field x inside PatternObject. */
#define PAT_OFF(x) offsetof(PatternObject, x)
/* Read-only attributes that map directly onto PatternObject fields. */
static PyMemberDef pattern_members[] = {
    {"pattern",   T_OBJECT,   PAT_OFF(pattern),       READONLY,
     "The pattern string from which the RE object was compiled."},
    {"flags",     T_INT,      PAT_OFF(flags),         READONLY,
     "The regex matching flags."},
    {"groups",    T_PYSSIZET, PAT_OFF(groups),        READONLY,
     "The number of capturing groups in the pattern."},
    /* Special member understood by the type-from-spec machinery: it gives
       the offset of the weak reference list of the instances. */
    {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
    {NULL}  /* Sentinel */
};
2675
/* Slots of the pattern heap type (referenced from pattern_spec):
   deallocation, repr, hashing, docstring, ==/!= comparison and the
   method/member/getset tables. */
static PyType_Slot pattern_slots[] = {
    {Py_tp_dealloc, (destructor)pattern_dealloc},
    {Py_tp_repr, (reprfunc)pattern_repr},
    {Py_tp_hash, (hashfunc)pattern_hash},
    {Py_tp_doc, (void *)pattern_doc},
    {Py_tp_richcompare, pattern_richcompare},
    {Py_tp_methods, pattern_methods},
    {Py_tp_members, pattern_members},
    {Py_tp_getset, pattern_getset},
    {0, NULL},
};
2687
/* Specification from which sre_exec() creates the pattern heap type.
   Instances are variable-sized: the item size is one SRE_CODE word.
   NOTE(review): pattern_slots lists no Py_tp_new slot, so the type may be
   instantiable from Python (inheriting the base type's constructor),
   which would yield an uninitialized pattern object -- confirm and, if
   so, disallow instantiation. */
static PyType_Spec pattern_spec = {
    .name = "re.Pattern",
    .basicsize = sizeof(PatternObject),
    .itemsize = sizeof(SRE_CODE),
    .flags = Py_TPFLAGS_DEFAULT,
    .slots = pattern_slots,
};
Larry Hastings2d0a69a2015-05-03 14:49:19 -07002695
/* Method table of the match type (installed through Py_tp_methods in
   match_slots).  "group" is registered by hand with METH_VARARGS; the
   _SRE_SRE_MATCH_*_METHODDEF entries are macros, presumably supplied by
   the clinic header, each expanding to a complete table entry. */
static PyMethodDef match_methods[] = {
    {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
    _SRE_SRE_MATCH_START_METHODDEF
    _SRE_SRE_MATCH_END_METHODDEF
    _SRE_SRE_MATCH_SPAN_METHODDEF
    _SRE_SRE_MATCH_GROUPS_METHODDEF
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
    _SRE_SRE_MATCH_EXPAND_METHODDEF
    _SRE_SRE_MATCH___COPY___METHODDEF
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
    /* class-level subscription (Match[...]) returns a generic alias */
    {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS,
     PyDoc_STR("See PEP 585")},
    {NULL, NULL}  /* Sentinel */
};
2710
/* Computed attributes of the match type.  All are getter-only; "regs"
   carries no docstring. */
static PyGetSetDef match_getset[] = {
    {"lastindex", (getter)match_lastindex_get, (setter)NULL,
     "The integer index of the last matched capturing group."},
    {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
     "The name of the last matched capturing group."},
    {"regs",      (getter)match_regs_get,      (setter)NULL},
    {NULL}  /* Sentinel */
};
2719
/* Byte offset of field x inside MatchObject. */
#define MATCH_OFF(x) offsetof(MatchObject, x)
/* Read-only attributes that map directly onto MatchObject fields; note
   that the Python-level name "re" exposes the pattern field. */
static PyMemberDef match_members[] = {
    {"string",  T_OBJECT,   MATCH_OFF(string),  READONLY,
     "The string passed to match() or search()."},
    {"re",      T_OBJECT,   MATCH_OFF(pattern), READONLY,
     "The regular expression object."},
    {"pos",     T_PYSSIZET, MATCH_OFF(pos),     READONLY,
     "The index into the string at which the RE engine started looking for a match."},
    {"endpos",  T_PYSSIZET, MATCH_OFF(endpos),  READONLY,
     "The index into the string beyond which the RE engine will not go."},
    {NULL}  /* Sentinel */
};
2732
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
/* Slots of the match heap type (referenced from match_spec). */
static PyType_Slot match_slots[] = {
    {Py_tp_dealloc, match_dealloc},
    {Py_tp_repr, match_repr},
    {Py_tp_doc, (void *)match_doc},
    {Py_tp_methods, match_methods},
    {Py_tp_members, match_members},
    {Py_tp_getset, match_getset},

    /* As mapping.
     *
     * Match objects do not support length or assignment, but do support
     * __getitem__.
     */
    {Py_mp_subscript, match_getitem},

    {0, NULL},
};
2752
/* Specification from which sre_exec() creates the match heap type.
   Instances are variable-sized: each item is one Py_ssize_t mark, and
   pattern_new_match() allocates two of them per group.
   NOTE(review): match_slots lists no Py_tp_new slot, so the type may be
   instantiable from Python (inheriting the base type's constructor),
   which would yield an uninitialized match object -- confirm and, if so,
   disallow instantiation. */
static PyType_Spec match_spec = {
    .name = "re.Match",
    .basicsize = sizeof(MatchObject),
    .itemsize = sizeof(Py_ssize_t),
    .flags = Py_TPFLAGS_DEFAULT,
    .slots = match_slots,
};
2760
/* Method table of the scanner type: match() and search().  Both entries
   are macros, presumably supplied by the clinic header, each expanding
   to a complete table entry. */
static PyMethodDef scanner_methods[] = {
    _SRE_SRE_SCANNER_MATCH_METHODDEF
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
    {NULL, NULL}  /* Sentinel */
};
2766
/* Byte offset of field x inside ScannerObject. */
#define SCAN_OFF(x) offsetof(ScannerObject, x)
/* Read-only attributes of the scanner type: only the pattern it scans
   with. */
static PyMemberDef scanner_members[] = {
    {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
    {NULL}  /* Sentinel */
};
2772
/* Slots of the scanner heap type (referenced from scanner_spec). */
static PyType_Slot scanner_slots[] = {
    {Py_tp_dealloc, scanner_dealloc},
    {Py_tp_methods, scanner_methods},
    {Py_tp_members, scanner_members},
    {0, NULL},
};
2779
/* Specification from which sre_exec() creates the scanner heap type.
   The type name is assembled from the SRE_MODULE string macro.
   NOTE(review): scanner_slots lists no Py_tp_new slot, so the type may be
   instantiable from Python (inheriting the base type's constructor),
   which would yield an uninitialized scanner -- confirm and, if so,
   disallow instantiation. */
static PyType_Spec scanner_spec = {
    .name = "_" SRE_MODULE ".SRE_Scanner",
    .basicsize = sizeof(ScannerObject),
    .flags = Py_TPFLAGS_DEFAULT,
    .slots = scanner_slots,
};
2786
/* Module-level functions (installed through sremodule.m_methods).  Every
   entry is a macro, presumably supplied by the clinic header, expanding
   to a complete table entry. */
static PyMethodDef _functions[] = {
    _SRE_COMPILE_METHODDEF
    _SRE_GETCODESIZE_METHODDEF
    _SRE_ASCII_ISCASED_METHODDEF
    _SRE_UNICODE_ISCASED_METHODDEF
    _SRE_ASCII_TOLOWER_METHODDEF
    _SRE_UNICODE_TOLOWER_METHODDEF
    {NULL, NULL}  /* Sentinel */
};
2796
Erlend Egeberg Aaslanda6109ef2020-11-20 13:36:23 +01002797static int
2798sre_traverse(PyObject *module, visitproc visit, void *arg)
2799{
2800 _sremodulestate *state = get_sre_module_state(module);
2801
2802 Py_VISIT(state->Pattern_Type);
2803 Py_VISIT(state->Match_Type);
2804 Py_VISIT(state->Scanner_Type);
2805
2806 return 0;
2807}
2808
2809static int
2810sre_clear(PyObject *module)
2811{
2812 _sremodulestate *state = get_sre_module_state(module);
2813
2814 Py_CLEAR(state->Pattern_Type);
2815 Py_CLEAR(state->Match_Type);
2816 Py_CLEAR(state->Scanner_Type);
2817
2818 return 0;
2819}
2820
2821static void
2822sre_free(void *module)
2823{
2824 sre_clear((PyObject *)module);
2825}
2826
/* Create a heap type from spec, bound to module m, and store it in type.
   On failure, jumps to a label named "error" that the enclosing function
   must provide. */
#define CREATE_TYPE(m, type, spec)                                  \
do {                                                                \
    type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
    if (type == NULL) {                                             \
        goto error;                                                 \
    }                                                               \
} while (0)
2834
/* Add an unsigned long constant called name to module.  The temporary
   int object is released again once the module has its own reference.
   On failure, jumps to a label named "error" that the enclosing function
   must provide. */
#define ADD_ULONG_CONSTANT(module, name, value)           \
    do {                                                  \
        PyObject *o = PyLong_FromUnsignedLong(value);     \
        if (!o)                                           \
            goto error;                                   \
        int res = PyModule_AddObjectRef(module, name, o); \
        Py_DECREF(o);                                     \
        if (res < 0) {                                    \
            goto error;                                   \
        }                                                 \
    } while (0)
2846
2847static int
2848sre_exec(PyObject *m)
2849{
2850 _sremodulestate *state;
2851
2852 /* Create heap types */
2853 state = get_sre_module_state(m);
2854 CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
2855 CREATE_TYPE(m, state->Match_Type, &match_spec);
2856 CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
2857
2858 if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
2859 goto error;
2860 }
2861
2862 if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
2863 goto error;
2864 }
2865
2866 ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
2867 ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
2868
2869 if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
2870 goto error;
2871 }
2872
2873 return 0;
2874
2875error:
2876 return -1;
2877}
2878
/* Module slots: sre_exec() is registered as the module's exec step. */
static PyModuleDef_Slot sre_slots[] = {
    {Py_mod_exec, sre_exec},
    {0, NULL},
};
2883
/* Module definition.  The per-module state (_sremodulestate) holds the
   type objects created by sre_exec(); the traverse/clear/free hooks
   manage the references stored there. */
static struct PyModuleDef sremodule = {
    .m_base = PyModuleDef_HEAD_INIT,
    .m_name = "_" SRE_MODULE,
    .m_size = sizeof(_sremodulestate),
    .m_methods = _functions,
    .m_slots = sre_slots,
    .m_traverse = sre_traverse,
    .m_free = sre_free,
    .m_clear = sre_clear,
};
2894
/* Module entry point: hands the module definition to PyModuleDef_Init();
   the actual setup happens in the Py_mod_exec slot (sre_exec). */
PyMODINIT_FUNC
PyInit__sre(void)
{
    return PyModuleDef_Init(&sremodule);
}
2900
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00002901/* vim:ts=4:sw=4:et
2902*/