blob: 19571fbf07aa1b80581b0a934520d978ece15b45 [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000070/* enables copy/deepcopy handling (work in progress) */
71#undef USE_BUILTIN_COPY
72
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000073#if PY_VERSION_HEX < 0x01060000
74#define PyObject_DEL(op) PyMem_DEL((op))
75#endif
76
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000077/* -------------------------------------------------------------------- */
78
Fredrik Lundh80946112000-06-29 18:03:25 +000079#if defined(_MSC_VER)
Guido van Rossumb700df92000-03-31 14:59:30 +000080#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
Fredrik Lundh28552902000-07-05 21:14:16 +000081#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
Guido van Rossumb700df92000-03-31 14:59:30 +000082/* fastest possible local call under MSVC */
83#define LOCAL(type) static __inline type __fastcall
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000084#elif defined(USE_INLINE)
Fredrik Lundh29c08be2000-06-29 23:33:12 +000085#define LOCAL(type) static inline type
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000086#else
87#define LOCAL(type) static type
Guido van Rossumb700df92000-03-31 14:59:30 +000088#endif
89
90/* error codes */
91#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000092#define SRE_ERROR_STATE -2 /* illegal state */
Fredrik Lundh96ab4652000-08-03 16:29:50 +000093#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
Guido van Rossumb700df92000-03-31 14:59:30 +000094#define SRE_ERROR_MEMORY -9 /* out of memory */
Christian Heimes2380ac72008-01-09 00:17:24 +000095#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000096
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000097#if defined(VERBOSE)
Guido van Rossumb700df92000-03-31 14:59:30 +000098#define TRACE(v) printf v
Guido van Rossumb700df92000-03-31 14:59:30 +000099#else
100#define TRACE(v)
101#endif
102
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000103/* -------------------------------------------------------------------- */
104/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000105
/* default character predicates (run sre_chars.py to regenerate tables) */

/* flag bits stored per character in sre_char_info[] */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII.  create tables in init_sre() instead */

/* Category flags for the character codes 0..127, sixteen entries per
   row.  The tables are only ever read, so they are declared const. */
static const char sre_char_info[128] = {
    /*   0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /*  16 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  32 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*  48 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /*  64 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /*  80 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /*  96 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 112 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};

/* Lowercase mapping for the character codes 0..127: the identity
   everywhere except 65..90 ('A'..'Z'), which map to 97..122. */
static const char sre_char_lower[128] = {
    /*   0 */   0,   1,   2,   3,   4,   5,   6,   7,
                8,   9,  10,  11,  12,  13,  14,  15,
    /*  16 */  16,  17,  18,  19,  20,  21,  22,  23,
               24,  25,  26,  27,  28,  29,  30,  31,
    /*  32 */  32,  33,  34,  35,  36,  37,  38,  39,
               40,  41,  42,  43,  44,  45,  46,  47,
    /*  48 */  48,  49,  50,  51,  52,  53,  54,  55,
               56,  57,  58,  59,  60,  61,  62,  63,
    /*  64 */  64,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122,  91,  92,  93,  94,  95,
    /*  96 */  96,  97,  98,  99, 100, 101, 102, 103,
              104, 105, 106, 107, 108, 109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119,
              120, 121, 122, 123, 124, 125, 126, 127
};

/* Each predicate yields the (nonzero) mask bit when the character has
   the property and 0 otherwise; codes >= 128 never match.  Note that
   the argument is evaluated more than once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
Guido van Rossumb700df92000-03-31 14:59:30 +0000144
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000145static unsigned int sre_lower(unsigned int ch)
146{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000147 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148}
149
/* locale-specific character predicates */
/* (c & ~N) is nonzero exactly when an unsigned c exceeds N; testing the
 * bits instead of writing (c < N+1) avoids compiler warnings when c's
 * type can only hold numbers < N+1.  Codes above 255 never match. */
#define SRE_LOC_IS_DIGIT(ch) (((ch) & ~255) ? 0 : isdigit((ch)))
#define SRE_LOC_IS_SPACE(ch) (((ch) & ~255) ? 0 : isspace((ch)))
#define SRE_LOC_IS_LINEBREAK(ch) ('\n' == (ch))
#define SRE_LOC_IS_ALNUM(ch) (((ch) & ~255) ? 0 : isalnum((ch)))
#define SRE_LOC_IS_WORD(ch) ((ch) == '_' || SRE_LOC_IS_ALNUM((ch)))
158
static unsigned int sre_lower_locale(unsigned int ch)
{
    /* Lowercase CH with the C library's tolower(); only codes below
       256 are passed to it, anything larger is returned untouched. */
    if (ch >= 256)
        return ch;
    return (unsigned int)tolower((ch));
}
163
/* unicode-specific character predicates */

/* thin aliases for the Py_UNICODE_* classification macros; a "word"
   character is an alphanumeric one or the underscore */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) ((ch) == '_' || SRE_UNI_IS_ALNUM(ch))
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000171
static unsigned int sre_lower_unicode(unsigned int ch)
{
    /* lowercase mapping is delegated entirely to Py_UNICODE_TOLOWER */
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);

    return lowered;
}
176
Guido van Rossumb700df92000-03-31 14:59:30 +0000177LOCAL(int)
178sre_category(SRE_CODE category, unsigned int ch)
179{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000180 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000181
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000182 case SRE_CATEGORY_DIGIT:
183 return SRE_IS_DIGIT(ch);
184 case SRE_CATEGORY_NOT_DIGIT:
185 return !SRE_IS_DIGIT(ch);
186 case SRE_CATEGORY_SPACE:
187 return SRE_IS_SPACE(ch);
188 case SRE_CATEGORY_NOT_SPACE:
189 return !SRE_IS_SPACE(ch);
190 case SRE_CATEGORY_WORD:
191 return SRE_IS_WORD(ch);
192 case SRE_CATEGORY_NOT_WORD:
193 return !SRE_IS_WORD(ch);
194 case SRE_CATEGORY_LINEBREAK:
195 return SRE_IS_LINEBREAK(ch);
196 case SRE_CATEGORY_NOT_LINEBREAK:
197 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000199 case SRE_CATEGORY_LOC_WORD:
200 return SRE_LOC_IS_WORD(ch);
201 case SRE_CATEGORY_LOC_NOT_WORD:
202 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000203
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000204 case SRE_CATEGORY_UNI_DIGIT:
205 return SRE_UNI_IS_DIGIT(ch);
206 case SRE_CATEGORY_UNI_NOT_DIGIT:
207 return !SRE_UNI_IS_DIGIT(ch);
208 case SRE_CATEGORY_UNI_SPACE:
209 return SRE_UNI_IS_SPACE(ch);
210 case SRE_CATEGORY_UNI_NOT_SPACE:
211 return !SRE_UNI_IS_SPACE(ch);
212 case SRE_CATEGORY_UNI_WORD:
213 return SRE_UNI_IS_WORD(ch);
214 case SRE_CATEGORY_UNI_NOT_WORD:
215 return !SRE_UNI_IS_WORD(ch);
216 case SRE_CATEGORY_UNI_LINEBREAK:
217 return SRE_UNI_IS_LINEBREAK(ch);
218 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
219 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000220 }
221 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000222}
223
224/* helpers */
225
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000226static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000228{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000231 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000232 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000233 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000234}
235
236static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000237data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000238{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000239 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000240 minsize = state->data_stack_base+size;
241 cursize = state->data_stack_size;
242 if (cursize < minsize) {
243 void* stack;
244 cursize = minsize+minsize/4+1024;
245 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000246 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000247 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000248 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000249 return SRE_ERROR_MEMORY;
250 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000253 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000254 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000255}
256
/* generate 8-bit version */

#define SRE_CHAR unsigned char
/* Fetch the byte at position INDEX of BUF.  STATE is accepted only so
   that the call sites match the unicode variant of this macro.  BUF and
   INDEX are parenthesized so that an argument such as "p + 1" is
   evaluated before the cast/subscript is applied. */
#define SRE_CHARGET(state, buf, index) (((unsigned char*)(buf))[(index)])
#define SRE_AT sre_at
#define SRE_COUNT sre_count
#define SRE_CHARSET sre_charset
#define SRE_INFO sre_info
#define SRE_MATCH sre_match
#define SRE_MATCH_CONTEXT sre_match_context
#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000268
Guido van Rossumb700df92000-03-31 14:59:30 +0000269#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000270#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000271#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000272
Guido van Rossumb700df92000-03-31 14:59:30 +0000273#undef SRE_SEARCH
274#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000276#undef SRE_INFO
277#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000278#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000279#undef SRE_AT
280#undef SRE_CHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281#undef SRE_CHARGET
Guido van Rossumb700df92000-03-31 14:59:30 +0000282
/* generate 8/16/32-bit unicode version */

#define SRE_CHAR void
/* Fetch the character at position INDEX of BUF, reading it as a
   Py_UCS1, Py_UCS2 or Py_UCS4 element depending on STATE->charsize.
   All arguments are parenthesized so that expressions such as
   "ptr + n" or "&st" are evaluated before the cast / member access is
   applied.  Note that the arguments are evaluated more than once. */
#define SRE_CHARGET(state, buf, index) \
    (((state)->charsize==1) ? ((Py_UCS1*)(buf))[(index)] : \
     ((state)->charsize==2) ? ((Py_UCS2*)(buf))[(index)] : \
     ((Py_UCS4*)(buf))[(index)])
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch
297
298#endif /* SRE_RECURSIVE */
299
300/* -------------------------------------------------------------------- */
301/* String matching engine */
302
303/* the following section is compiled twice, with different character
304 settings */
305
306LOCAL(int)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200307SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
Guido van Rossumb700df92000-03-31 14:59:30 +0000308{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000310
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000311 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000312
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000313 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000316 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000317 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 case SRE_AT_BEGINNING_LINE:
320 return ((void*) ptr == state->beginning ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000323 case SRE_AT_END:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 return (((void*) (ptr+state->charsize) == state->end &&
325 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000326 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000328 case SRE_AT_END_LINE:
329 return ((void*) ptr == state->end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000331
Fredrik Lundh770617b2001-01-14 15:06:11 +0000332 case SRE_AT_END_STRING:
333 return ((void*) ptr == state->end);
334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 case SRE_AT_BOUNDARY:
336 if (state->beginning == state->end)
337 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000338 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200339 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000340 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200341 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_NON_BOUNDARY:
345 if (state->beginning == state->end)
346 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200348 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000349 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000352
353 case SRE_AT_LOC_BOUNDARY:
354 if (state->beginning == state->end)
355 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000361
362 case SRE_AT_LOC_NON_BOUNDARY:
363 if (state->beginning == state->end)
364 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000370
371 case SRE_AT_UNI_BOUNDARY:
372 if (state->beginning == state->end)
373 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200375 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000378 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000379
380 case SRE_AT_UNI_NON_BOUNDARY:
381 if (state->beginning == state->end)
382 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000388
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000389 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000390
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000391 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000392}
393
394LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000395SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000396{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000398
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000399 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000400
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000401 for (;;) {
402 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000403
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000404 case SRE_OP_FAILURE:
405 return !ok;
406
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000407 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000408 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000409 if (ch == set[0])
410 return ok;
411 set++;
412 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000413
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000414 case SRE_OP_CATEGORY:
415 /* <CATEGORY> <code> */
416 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000418 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000420
Fredrik Lundh3562f112000-07-02 12:00:07 +0000421 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000422 if (sizeof(SRE_CODE) == 2) {
423 /* <CHARSET> <bitmap> (16 bits per code word) */
424 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
425 return ok;
426 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000427 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000428 else {
429 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800430 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000431 return ok;
432 set += 8;
433 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000434 break;
435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_RANGE:
437 /* <RANGE> <lower> <upper> */
438 if (set[0] <= ch && ch <= set[1])
439 return ok;
440 set += 2;
441 break;
442
443 case SRE_OP_NEGATE:
444 ok = !ok;
445 break;
446
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000447 case SRE_OP_BIGCHARSET:
448 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
449 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000450 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000451 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000452
453 if (sizeof(SRE_CODE) == 2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000455 set += 128;
456 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
457 return ok;
458 set += count*16;
459 }
460 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000461 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
462 * warnings when c's type supports only numbers < N+1 */
463 if (!(ch & ~65535))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200464 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000465 else
466 block = -1;
467 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000468 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800469 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000470 return ok;
471 set += count*8;
472 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000473 break;
474 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000475
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000476 default:
477 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000478 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000479 return 0;
480 }
481 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000482}
483
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000484LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000485
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000486LOCAL(Py_ssize_t)
487SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000488{
489 SRE_CODE chr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490 char* ptr = (char *)state->ptr;
491 char* end = (char *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000492 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000493
494 /* adjust end */
Serhiy Storchakaa0eb8092013-02-16 16:54:33 +0200495 if (maxcount < (end - ptr) / state->charsize && maxcount != SRE_MAXREPEAT)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200496 end = ptr + maxcount*state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000497
498 switch (pattern[0]) {
499
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000500 case SRE_OP_IN:
501 /* repeated set */
502 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Victor Stinner63ab8752011-11-22 03:31:20 +0100503 while (ptr < end &&
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200504 SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
505 ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000506 break;
507
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000508 case SRE_OP_ANY:
509 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000510 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511 while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
512 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513 break;
514
515 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000516 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000518 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000519 ptr = end;
520 break;
521
522 case SRE_OP_LITERAL:
523 /* repeated literal */
524 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000525 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200526 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
527 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000528 break;
529
530 case SRE_OP_LITERAL_IGNORE:
531 /* repeated literal */
532 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000533 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
535 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000536 break;
537
538 case SRE_OP_NOT_LITERAL:
539 /* repeated non-literal */
540 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
543 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 break;
Tim Peters3d563502006-01-21 02:47:53 +0000545
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 case SRE_OP_NOT_LITERAL_IGNORE:
547 /* repeated non-literal */
548 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
551 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 break;
553
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000554 default:
555 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557 while ((char*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000558 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000559 if (i < 0)
560 return i;
561 if (!i)
562 break;
563 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 ((char*)state->ptr - ptr)/state->charsize));
566 return ((char*)state->ptr - ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000567 }
568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, (ptr - (char*) state->ptr)/state->charsize));
570 return (ptr - (char*) state->ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000571}
572
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* Test whether the SRE_OP_INFO block at `pattern` is compatible with
       the text at state->ptr.  Returns 0 when the block rules the
       position out; otherwise returns a non-zero value computed from the
       block (the number of SRE_CODE objects to skip).

       Fields read from the block: pattern[2] = flags, pattern[3] =
       minimal length, pattern[5] = prefix length, pattern[6] = prefix
       skip, pattern[7...] = prefix characters. */

    char* cursor = state->ptr;
    char* limit = state->end;
    Py_ssize_t k;

    /* too few characters left to satisfy the minimal length */
    if (pattern[3] && (limit - cursor)/state->charsize < pattern[3])
        return 0;

    /* no usable literal prefix: nothing more to verify */
    if (!(pattern[2] & SRE_INFO_PREFIX) || pattern[5] <= 1)
        return pattern[0];

    /* <length> <skip> <prefix data> <overlap data> */
    k = 0;
    while (k < pattern[5]) {
        if ((SRE_CODE) SRE_CHARGET(state, cursor, k) != pattern[7 + k])
            return 0;
        k++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000600
/* The macros below should be used to protect recursive SRE_MATCH()
 * calls that *failed* and do *not* return immediately (in other words,
 * those that will backtrack). The cases are:
 *
 * - Recursive SRE_MATCH() returned true: that's usually a success
 *   (apart from atypical cases like ASSERT_NOT), therefore there's no
 *   reason to restore lastmark;
 *
 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
 *   is returning to the caller: if the current SRE_MATCH() is the
 *   top function of the recursion, returning false will be a matching
 *   failure, and it doesn't matter where lastmark is pointing to.
 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
 *   failure by itself, and the calling SRE_MATCH() will have to deal
 *   with the failure by the same rules explained here (it will restore
 *   lastmark by itself if necessary);
 *
 * - Recursive SRE_MATCH() returned false, and will continue the
 *   outside 'for' loop: must be protected when breaking, since the next
 *   OP could potentially depend on lastmark;
 *
 * - Recursive SRE_MATCH() returned false, and will be called again
 *   inside a local for/while loop: must be protected between each
 *   loop iteration, since the recursive SRE_MATCH() could do anything,
 *   and could potentially depend on lastmark.
 *
 * For more information, check the discussion at SF patch #712900.
 */
/* Snapshot state->lastmark / state->lastindex into the current match
   context (`ctx`) so they can be put back after a failed sub-match. */
#define LASTMARK_SAVE() \
    ((void)(ctx->lastmark = state->lastmark, \
            ctx->lastindex = state->lastindex))

/* Put back the values captured by the most recent LASTMARK_SAVE() on
   this context. */
#define LASTMARK_RESTORE() \
    ((void)(state->lastmark = ctx->lastmark, \
            state->lastindex = ctx->lastindex))
639
/* Exit helpers for the matcher.
 *
 * RETURN_ERROR(i)   - return `i` from the enclosing function at once,
 *                     bypassing the `exit` label.
 * RETURN_FAILURE    - set `ret` to 0 and jump to the `exit` label.
 * RETURN_SUCCESS    - set `ret` to 1 and jump to the `exit` label.
 *
 * The enclosing function must therefore declare `ret` and provide an
 * `exit:` label.
 */
#define RETURN_ERROR(i) do { return (i); } while (0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while (0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while (0)

/* Conditional variants.  The argument is parenthesized so that an
 * expression such as `a & b` is compared as a whole; note that it is
 * still evaluated more than once, so it must be free of side effects.
 *
 * RETURN_ON_ERROR(i)   - i < 0: propagate it as an error.
 * RETURN_ON_SUCCESS(i) - i < 0: error; i > 0: success; i == 0: fall through.
 * RETURN_ON_FAILURE(i) - i < 0: error; i == 0: failure; i > 0: fall through.
 */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
650
/* stringify a macro argument (used for trace output only) */
#define SFY(x) #x

/* Reserve sizeof(type) bytes at the top of `state`'s data stack and make
 * `ptr` point at them (typed as `type*`).
 *
 * Relies on names from the enclosing function: `alloc_pos` receives the
 * offset of the new object, and `ctx` / `ctx_pos` identify the current
 * match context.  If the stack has to grow, `ctx` is looked up again from
 * `ctx_pos` afterwards (skipped while ctx_pos is still -1, i.e. before the
 * first context exists).  A negative result from data_stack_grow() is
 * returned from the enclosing function directly.
 *
 * Trace arguments are cast so that they match their conversion
 * specifiers regardless of the width of Py_ssize_t / size_t.
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %ld (%lu)\n", \
           SFY(type), (long)alloc_pos, (unsigned long)sizeof(type))); \
    if (sizeof(type) > (state)->data_stack_size - alloc_pos) { \
        int j = data_stack_grow(state, sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack + alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)
667
/* Point `ptr` (as `type*`) at byte offset `pos` inside `state`'s data
 * stack.  Offsets rather than raw pointers are what stay meaningful when
 * the stack buffer is replaced, which is why callers re-run this after a
 * possible grow.  The traced offset is cast to match its specifier. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %ld\n", SFY(type), (long)(pos))); \
    (ptr) = (type*)((state)->data_stack + (pos)); \
} while (0)
673
/* Copy `size` bytes starting at `data` onto the top of `state`'s data
 * stack, growing the stack first when it is too small.
 *
 * Relies on `ctx` / `ctx_pos` from the enclosing function: after a grow,
 * `ctx` is looked up again from `ctx_pos` (unless ctx_pos is -1).  A
 * negative result from data_stack_grow() is returned from the enclosing
 * function directly.
 *
 * `size` and `data` are evaluated several times, so they must not have
 * side effects.  Trace arguments are cast to match their specifiers.
 */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %ld (%lu)\n", \
           (void*)(data), (long)((state)->data_stack_base), \
           (unsigned long)(size))); \
    if ((size) > (state)->data_stack_size - (state)->data_stack_base) { \
        int j = data_stack_grow(state, (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack + (state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)
687
/* Copy the topmost `size` bytes of `state`'s data stack into `data`.
 * When `discard` is non-zero the bytes are also removed from the stack;
 * otherwise the stack is left untouched so the same data can be read
 * again.  `size` is evaluated several times.  Trace arguments are cast
 * to match their specifiers. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %ld (%lu)\n", \
           (void*)(data), (long)((state)->data_stack_base - (size)), \
           (unsigned long)(size))); \
    memcpy((data), \
           (state)->data_stack + (state)->data_stack_base - (size), \
           (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)
696
/* Drop the topmost `size` bytes from `state`'s data stack without
 * copying them anywhere.  Trace arguments are cast to match their
 * specifiers. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %ld (%lu)\n", \
           (long)((state)->data_stack_base - (size)), \
           (unsigned long)(size))); \
    (state)->data_stack_base -= (size); \
} while (0)
703
/* Shorthands for the DATA_STACK_* macros, bound to the local variable
 * `state`.  For DATA_PUSH / DATA_POP / DATA_POP_DISCARD, `x` is a pointer
 * to the object and the byte count is taken from sizeof(*(x)).
 *
 *   DATA_PUSH(x)           push a copy of *x
 *   DATA_POP(x)            pop into *x (removes it from the stack)
 *   DATA_POP_DISCARD(x)    drop sizeof(*x) bytes without copying
 *   DATA_ALLOC(t, p)       reserve room for one `t`, p = its address
 *   DATA_LOOKUP_AT(t,p,pos) p = address of the `t` at stack offset pos
 */
#define DATA_PUSH(x)              DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x)               DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x)       DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p)           DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos)   DATA_STACK_LOOKUP_AT(state,t,p,pos)
714
/* Save / restore the first (lastmark+1) entries of state->mark on the
 * data stack.  All four macros do nothing unless `lastmark` is > 0.
 * They rely on the local variable `state`; MARK_PUSH additionally uses
 * the local variable `i` as scratch space.
 *
 *   MARK_PUSH(lastmark)          push a copy of the marks
 *   MARK_POP(lastmark)           copy them back and remove them
 *   MARK_POP_KEEP(lastmark)      copy them back but leave them pushed
 *   MARK_POP_DISCARD(lastmark)   remove them without copying back
 */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, \
                           ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
732
/* Values stored in SRE_MATCH_CONTEXT.jump.  Each one names the DO_JUMP
   call site that created the context; JUMP_NONE marks the initial
   context, which was not created by a jump. */
#define JUMP_NONE            0

/* MAX_UNTIL / MIN_UNTIL */
#define JUMP_MAX_UNTIL_1     1
#define JUMP_MAX_UNTIL_2     2
#define JUMP_MAX_UNTIL_3     3
#define JUMP_MIN_UNTIL_1     4
#define JUMP_MIN_UNTIL_2     5
#define JUMP_MIN_UNTIL_3     6

/* REPEAT, REPEAT_ONE, MIN_REPEAT_ONE */
#define JUMP_REPEAT          7
#define JUMP_REPEAT_ONE_1    8
#define JUMP_REPEAT_ONE_2    9
#define JUMP_MIN_REPEAT_ONE  10

/* BRANCH and assertions */
#define JUMP_BRANCH          11
#define JUMP_ASSERT          12
#define JUMP_ASSERT_NOT      13
747
/* Start matching `nextpattern` in a fresh match context.
 *
 * A new SRE_MATCH_CONTEXT is allocated on the data stack, linked back to
 * the current one through last_ctx_pos, tagged with `jumpvalue` (one of
 * the JUMP_* codes) and made current; control then goes to the `entrance`
 * label.  `jumplabel` is defined right after the goto so that execution
 * can come back to the statement following the DO_JUMP; call sites read
 * the outcome from `ret`.  (The code that jumps back to `jumplabel` is
 * not part of this macro.)
 *
 * `nextpattern` is evaluated after the allocation, i.e. after `ctx` has
 * been refreshed if the data stack had to grow.
 *
 * This expands to several statements plus a label and is NOT wrapped in
 * do { } while (0): use it only as a complete statement inside a braced
 * block, never as the unbraced body of an if/else/loop.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
758
759typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000760 Py_ssize_t last_ctx_pos;
761 Py_ssize_t jump;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200762 char* ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000763 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000764 Py_ssize_t count;
765 Py_ssize_t lastmark;
766 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000767 union {
768 SRE_CODE chr;
769 SRE_REPEAT* rep;
770 } u;
771} SRE_MATCH_CONTEXT;
772
773/* check if string matches the given pattern. returns <0 for
774 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000775LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000776SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200778 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000779 Py_ssize_t alloc_pos, ctx_pos = -1;
780 Py_ssize_t i, ret = 0;
781 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000782 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000783
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000784 SRE_MATCH_CONTEXT* ctx;
785 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000786
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000787 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000788
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
790 ctx->last_ctx_pos = -1;
791 ctx->jump = JUMP_NONE;
792 ctx->pattern = pattern;
793 ctx_pos = alloc_pos;
794
795entrance:
796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200797 ctx->ptr = (char *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798
799 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000800 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000801 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000803 TRACE(("reject (got %d chars, need %d)\n",
Serhiy Storchakac1b59d42012-12-29 23:38:48 +0200804 (end - ctx->ptr)/state->charsize, ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000805 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000806 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000807 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000808 }
809
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000810 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000811 ++sigcount;
812 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
813 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000814
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000815 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000816
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000817 case SRE_OP_MARK:
818 /* set mark */
819 /* <MARK> <gid> */
820 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
821 ctx->ptr, ctx->pattern[0]));
822 i = ctx->pattern[0];
823 if (i & 1)
824 state->lastindex = i/2 + 1;
825 if (i > state->lastmark) {
826 /* state->lastmark is the highest valid index in the
827 state->mark array. If it is increased by more than 1,
828 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000829 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000830 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 while (j < i)
832 state->mark[j++] = NULL;
833 state->lastmark = i;
834 }
835 state->mark[i] = ctx->ptr;
836 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000837 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000839 case SRE_OP_LITERAL:
840 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000841 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000842 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
843 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000845 RETURN_FAILURE;
846 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000848 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000849
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000850 case SRE_OP_NOT_LITERAL:
851 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000852 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000853 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
854 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200855 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000856 RETURN_FAILURE;
857 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000859 break;
860
861 case SRE_OP_SUCCESS:
862 /* end of pattern */
863 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
864 state->ptr = ctx->ptr;
865 RETURN_SUCCESS;
866
867 case SRE_OP_AT:
868 /* match at given position */
869 /* <AT> <code> */
870 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
871 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
872 RETURN_FAILURE;
873 ctx->pattern++;
874 break;
875
876 case SRE_OP_CATEGORY:
877 /* match at given category */
878 /* <CATEGORY> <code> */
879 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000882 RETURN_FAILURE;
883 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000885 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000886
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000887 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000888 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000889 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000890 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891 if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
892 RETURN_FAILURE;
893 ctx->ptr += state->charsize;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000894 break;
895
896 case SRE_OP_ANY_ALL:
897 /* match anything */
898 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000899 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
900 if (ctx->ptr >= end)
901 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000903 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000904
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000905 case SRE_OP_IN:
906 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000907 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000908 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
910 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000911 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000913 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000914
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000915 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000916 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
917 ctx->pattern, ctx->ptr, ctx->pattern[0]));
918 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000920 RETURN_FAILURE;
921 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000923 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000925 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000926 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
927 ctx->pattern, ctx->ptr, *ctx->pattern));
928 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 RETURN_FAILURE;
931 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200932 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000933 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000934
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000935 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000936 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
937 if (ctx->ptr >= end
938 || !SRE_CHARSET(ctx->pattern+1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200939 (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000940 RETURN_FAILURE;
941 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000943 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000944
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 case SRE_OP_JUMP:
946 case SRE_OP_INFO:
947 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000948 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000949 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
950 ctx->ptr, ctx->pattern[0]));
951 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 break;
953
954 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000955 /* alternation */
956 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000957 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000958 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000959 ctx->u.rep = state->repeat;
960 if (ctx->u.rep)
961 MARK_PUSH(ctx->lastmark);
962 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
963 if (ctx->pattern[1] == SRE_OP_LITERAL &&
964 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000966 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000967 if (ctx->pattern[1] == SRE_OP_IN &&
968 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000970 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000971 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000972 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 if (ret) {
974 if (ctx->u.rep)
975 MARK_POP_DISCARD(ctx->lastmark);
976 RETURN_ON_ERROR(ret);
977 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000978 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 if (ctx->u.rep)
980 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000981 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000982 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 if (ctx->u.rep)
984 MARK_POP_DISCARD(ctx->lastmark);
985 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +0000986
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000987 case SRE_OP_REPEAT_ONE:
988 /* match repeated sequence (maximizing regexp) */
989
990 /* this operator only works if the repeated item is
991 exactly one character wide, and we're not already
992 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000993 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000994
995 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
996
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
998 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000999
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03001000 if (ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001001 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001002
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001004
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001005 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1006 RETURN_ON_ERROR(ret);
1007 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1008 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009 ctx->ptr += state->charsize * ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001010
1011 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001012 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001013 string. check if the rest of the pattern matches,
1014 and backtrack if not. */
1015
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001016 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001017 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001021 state->ptr = ctx->ptr;
1022 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001023 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001024
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001025 LASTMARK_SAVE();
1026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028 /* tail starts with a literal. skip positions where
1029 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001032 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Victor Stinner63ab8752011-11-22 03:31:20 +01001033 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
1035 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001038 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001039 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1042 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ret) {
1044 RETURN_ON_ERROR(ret);
1045 RETURN_SUCCESS;
1046 }
Tim Peters3d563502006-01-21 02:47:53 +00001047
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001048 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 }
1053
1054 } else {
1055 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001056 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1059 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 if (ret) {
1061 RETURN_ON_ERROR(ret);
1062 RETURN_SUCCESS;
1063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001066 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001067 }
1068 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001070
Guido van Rossum41c99e72003-04-14 17:59:34 +00001071 case SRE_OP_MIN_REPEAT_ONE:
1072 /* match repeated sequence (minimizing regexp) */
1073
1074 /* this operator only works if the repeated item is
1075 exactly one character wide, and we're not already
1076 collecting backtracking points. for other cases,
1077 use the MIN_REPEAT operator */
1078
1079 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1080
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1082 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001083
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03001084 if (ctx->pattern[1] > (end - ctx->ptr) / state->charsize)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001085 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001086
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001087 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001088
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001089 if (ctx->pattern[1] == 0)
1090 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001091 else {
1092 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001093 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1094 RETURN_ON_ERROR(ret);
1095 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001096 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001097 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001098 RETURN_FAILURE;
1099 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001100 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 ctx->ptr += state->charsize * ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001102 }
1103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001105 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001106 state->ptr = ctx->ptr;
1107 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001108
1109 } else {
1110 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001111 LASTMARK_SAVE();
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001112 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001113 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001114 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001115 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1116 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 if (ret) {
1118 RETURN_ON_ERROR(ret);
1119 RETURN_SUCCESS;
1120 }
1121 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001122 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001123 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001124 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001126 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 assert(ret == 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001130 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001132 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001133 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001135 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001136 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001137 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001138 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001139 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1140 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001141
1142 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001143 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001144 if (!ctx->u.rep) {
1145 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001146 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001147 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 ctx->u.rep->count = -1;
1149 ctx->u.rep->pattern = ctx->pattern;
1150 ctx->u.rep->prev = state->repeat;
1151 ctx->u.rep->last_ptr = NULL;
1152 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001153
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001156 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001157 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001158
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 if (ret) {
1160 RETURN_ON_ERROR(ret);
1161 RETURN_SUCCESS;
1162 }
1163 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
1165 case SRE_OP_MAX_UNTIL:
1166 /* maximizing repeat */
1167 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1168
1169 /* FIXME: we probably need to deal with zero-width
1170 matches in here... */
1171
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001172 ctx->u.rep = state->repeat;
1173 if (!ctx->u.rep)
1174 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001177
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001179
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001180 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1181 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001182
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001183 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001184 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001186 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1187 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001188 if (ret) {
1189 RETURN_ON_ERROR(ret);
1190 RETURN_SUCCESS;
1191 }
1192 ctx->u.rep->count = ctx->count-1;
1193 state->ptr = ctx->ptr;
1194 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001195 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 if ((ctx->count < ctx->u.rep->pattern[2] ||
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001198 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001199 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200 /* we may have enough matches, but if we can
1201 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001203 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 MARK_PUSH(ctx->lastmark);
1205 /* zero-width match protection */
1206 DATA_PUSH(&ctx->u.rep->last_ptr);
1207 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1209 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001210 DATA_POP(&ctx->u.rep->last_ptr);
1211 if (ret) {
1212 MARK_POP_DISCARD(ctx->lastmark);
1213 RETURN_ON_ERROR(ret);
1214 RETURN_SUCCESS;
1215 }
1216 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001217 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001220 }
1221
1222 /* cannot match more repeated items here. make sure the
1223 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001224 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001226 RETURN_ON_SUCCESS(ret);
1227 state->repeat = ctx->u.rep;
1228 state->ptr = ctx->ptr;
1229 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001230
1231 case SRE_OP_MIN_UNTIL:
1232 /* minimizing repeat */
1233 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1234
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 ctx->u.rep = state->repeat;
1236 if (!ctx->u.rep)
1237 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001238
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001239 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001240
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001241 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001242
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001243 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1244 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001245
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001247 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001249 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1250 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 if (ret) {
1252 RETURN_ON_ERROR(ret);
1253 RETURN_SUCCESS;
1254 }
1255 ctx->u.rep->count = ctx->count-1;
1256 state->ptr = ctx->ptr;
1257 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001258 }
1259
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001260 LASTMARK_SAVE();
1261
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001262 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001263 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 if (ret) {
1266 RETURN_ON_ERROR(ret);
1267 RETURN_SUCCESS;
1268 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001269
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 state->repeat = ctx->u.rep;
1271 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001272
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001273 LASTMARK_RESTORE();
1274
Serhiy Storchakafa468162013-02-16 21:23:53 +02001275 if ((ctx->count >= ctx->u.rep->pattern[2]
1276 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1277 state->ptr == ctx->u.rep->last_ptr)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001278 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001279
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001280 ctx->u.rep->count = ctx->count;
Serhiy Storchakafa468162013-02-16 21:23:53 +02001281 /* zero-width match protection */
1282 DATA_PUSH(&ctx->u.rep->last_ptr);
1283 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001284 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1285 ctx->u.rep->pattern+3);
Serhiy Storchakafa468162013-02-16 21:23:53 +02001286 DATA_POP(&ctx->u.rep->last_ptr);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001287 if (ret) {
1288 RETURN_ON_ERROR(ret);
1289 RETURN_SUCCESS;
1290 }
1291 ctx->u.rep->count = ctx->count-1;
1292 state->ptr = ctx->ptr;
1293 RETURN_FAILURE;
1294
1295 case SRE_OP_GROUPREF:
1296 /* match backreference */
1297 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1298 ctx->ptr, ctx->pattern[0]));
1299 i = ctx->pattern[0];
1300 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001301 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001302 if (groupref >= state->lastmark) {
1303 RETURN_FAILURE;
1304 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 char* p = (char*) state->mark[groupref];
1306 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001307 if (!p || !e || e < p)
1308 RETURN_FAILURE;
1309 while (p < e) {
Victor Stinner63ab8752011-11-22 03:31:20 +01001310 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001312 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 p += state->charsize;
1314 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001315 }
1316 }
1317 }
1318 ctx->pattern++;
1319 break;
1320
1321 case SRE_OP_GROUPREF_IGNORE:
1322 /* match backreference */
1323 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1324 ctx->ptr, ctx->pattern[0]));
1325 i = ctx->pattern[0];
1326 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001327 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001328 if (groupref >= state->lastmark) {
1329 RETURN_FAILURE;
1330 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 char* p = (char*) state->mark[groupref];
1332 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001333 if (!p || !e || e < p)
1334 RETURN_FAILURE;
1335 while (p < e) {
1336 if (ctx->ptr >= end ||
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001337 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
1338 state->lower(SRE_CHARGET(state, p, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001339 RETURN_FAILURE;
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001340 p += state->charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001342 }
1343 }
1344 }
1345 ctx->pattern++;
1346 break;
1347
1348 case SRE_OP_GROUPREF_EXISTS:
1349 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1350 ctx->ptr, ctx->pattern[0]));
1351 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1352 i = ctx->pattern[0];
1353 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001354 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001355 if (groupref >= state->lastmark) {
1356 ctx->pattern += ctx->pattern[1];
1357 break;
1358 } else {
1359 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1360 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1361 if (!p || !e || e < p) {
1362 ctx->pattern += ctx->pattern[1];
1363 break;
1364 }
1365 }
1366 }
1367 ctx->pattern += 2;
1368 break;
1369
1370 case SRE_OP_ASSERT:
1371 /* assert subpattern */
1372 /* <ASSERT> <skip> <back> <pattern> */
1373 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1374 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001376 if (state->ptr < state->beginning)
1377 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001378 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001379 RETURN_ON_FAILURE(ret);
1380 ctx->pattern += ctx->pattern[0];
1381 break;
1382
1383 case SRE_OP_ASSERT_NOT:
1384 /* assert not subpattern */
1385 /* <ASSERT_NOT> <skip> <back> <pattern> */
1386 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1387 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001389 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001390 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001391 if (ret) {
1392 RETURN_ON_ERROR(ret);
1393 RETURN_FAILURE;
1394 }
1395 }
1396 ctx->pattern += ctx->pattern[0];
1397 break;
1398
1399 case SRE_OP_FAILURE:
1400 /* immediate failure */
1401 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1402 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001403
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001404 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001405 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1406 ctx->pattern[-1]));
1407 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001408 }
1409 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001410
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001411exit:
1412 ctx_pos = ctx->last_ctx_pos;
1413 jump = ctx->jump;
1414 DATA_POP_DISCARD(ctx);
1415 if (ctx_pos == -1)
1416 return ret;
1417 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1418
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001419 switch (jump) {
1420 case JUMP_MAX_UNTIL_2:
1421 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1422 goto jump_max_until_2;
1423 case JUMP_MAX_UNTIL_3:
1424 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1425 goto jump_max_until_3;
1426 case JUMP_MIN_UNTIL_2:
1427 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1428 goto jump_min_until_2;
1429 case JUMP_MIN_UNTIL_3:
1430 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1431 goto jump_min_until_3;
1432 case JUMP_BRANCH:
1433 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1434 goto jump_branch;
1435 case JUMP_MAX_UNTIL_1:
1436 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1437 goto jump_max_until_1;
1438 case JUMP_MIN_UNTIL_1:
1439 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1440 goto jump_min_until_1;
1441 case JUMP_REPEAT:
1442 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1443 goto jump_repeat;
1444 case JUMP_REPEAT_ONE_1:
1445 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1446 goto jump_repeat_one_1;
1447 case JUMP_REPEAT_ONE_2:
1448 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1449 goto jump_repeat_one_2;
1450 case JUMP_MIN_REPEAT_ONE:
1451 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1452 goto jump_min_repeat_one;
1453 case JUMP_ASSERT:
1454 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1455 goto jump_assert;
1456 case JUMP_ASSERT_NOT:
1457 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1458 goto jump_assert_not;
1459 case JUMP_NONE:
1460 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1461 break;
1462 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001463
1464 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001465}
1466
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001467LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001468SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 char* ptr = (char*)state->start;
1471 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001472 Py_ssize_t status = 0;
1473 Py_ssize_t prefix_len = 0;
1474 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001475 SRE_CODE* prefix = NULL;
1476 SRE_CODE* charset = NULL;
1477 SRE_CODE* overlap = NULL;
1478 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001479
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001480 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001481 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001482 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001483
1484 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001485
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001486 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001487 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001488 character in there, so literal search will work) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 end -= (pattern[3]-1) * state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001490 if (end <= ptr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 end = ptr + state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001492 }
1493
Fredrik Lundh3562f112000-07-02 12:00:07 +00001494 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001495 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001496 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001497 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001498 prefix_skip = pattern[6];
1499 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001500 overlap = prefix + prefix_len - 1;
1501 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001502 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001503 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001504 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001505
1506 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001507 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001508
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001509 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1510 TRACE(("charset = %p\n", charset));
1511
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001512#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001513 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001514 /* pattern starts with a known prefix. use the overlap
1515 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001516 Py_ssize_t i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517 end = (char *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001518 while (ptr < end) {
1519 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001521 if (!i)
1522 break;
1523 else
1524 i = overlap[i];
1525 } else {
1526 if (++i == prefix_len) {
1527 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001528 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 state->start = ptr - (prefix_len - 1) * state->charsize;
1530 state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001531 if (flags & SRE_INFO_LITERAL)
1532 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001533 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001534 if (status != 0)
1535 return status;
1536 /* close but no cigar -- try again */
1537 i = overlap[i];
1538 }
1539 break;
1540 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 ptr += state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001543 }
1544 return 0;
1545 }
1546#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001547
Fredrik Lundh3562f112000-07-02 12:00:07 +00001548 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001549 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001550 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001551 SRE_CODE chr = pattern[1];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001553 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
1555 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001556 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001557 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001558 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001559 state->start = ptr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 ptr += state->charsize;
1561 state->ptr = ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001562 if (flags & SRE_INFO_LITERAL)
1563 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001564 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001565 if (status != 0)
1566 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001567 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001568 } else if (charset) {
1569 /* pattern starts with a character from a known set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001571 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
1573 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001574 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001575 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001576 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001577 state->start = ptr;
1578 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001579 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001580 if (status != 0)
1581 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582 ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001583 }
1584 } else
1585 /* general case */
1586 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001587 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 state->start = state->ptr = ptr;
1589 ptr += state->charsize;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001590 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001591 if (status != 0)
1592 break;
1593 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001594
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001595 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001596}
Tim Peters3d563502006-01-21 02:47:53 +00001597
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001598#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001599
1600/* -------------------------------------------------------------------- */
1601/* factories and destructors */
1602
1603/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001604static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001605static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +00001606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607static int
1608sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
1609{
1610 /* check if given string is a literal template (i.e. no escapes) */
1611 struct {
1612 int charsize;
1613 } state = {
1614 charsize
1615 };
1616 while (len-- > 0) {
1617 if (SRE_CHARGET((&state), ptr, 0) == '\\')
1618 return 0;
1619 ptr += charsize;
1620 }
1621 return 1;
1622}
1623
Guido van Rossumb700df92000-03-31 14:59:30 +00001624static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001625sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001626{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001627 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001628}
1629
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001630static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001631sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001632{
1633 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001634 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001635 return NULL;
1636 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001637 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001638 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001639 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +00001640 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001641}
1642
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001643LOCAL(void)
1644state_reset(SRE_STATE* state)
1645{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001646 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001647 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001648
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001649 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001650 state->lastindex = -1;
1651
1652 state->repeat = NULL;
1653
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001654 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001655}
1656
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001657static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658getstring(PyObject* string, Py_ssize_t* p_length,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001659 int* p_logical_charsize, int* p_charsize,
1660 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001661{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001662 /* given a python object, return a data pointer, a length (in
1663 characters), and a character size. return NULL if the object
1664 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001665
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001666 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001667 Py_ssize_t size, bytes;
1668 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001669 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001670
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001671 /* Unicode objects do not support the buffer API. So, get the data
1672 directly instead. */
1673 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 if (PyUnicode_READY(string) == -1)
1675 return NULL;
1676 ptr = PyUnicode_DATA(string);
1677 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001678 *p_charsize = PyUnicode_KIND(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 *p_logical_charsize = 4;
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001680 return ptr;
1681 }
1682
Victor Stinner0058b862011-09-29 03:27:47 +02001683 /* get pointer to byte string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001684 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001685 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001686 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001687 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001688 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1689 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001690 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001691
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001692 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001693 bytes = view->len;
1694 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001695
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001696 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001697 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001698 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001699 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001700
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001701 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001702 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001703
Christian Heimes72b710a2008-05-26 13:28:38 +00001704 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001705 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001706 else {
1707 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001708 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001709 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001710
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001711 *p_length = size;
1712 *p_charsize = charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 *p_logical_charsize = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001714
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001715 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001716 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001717 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001718 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001719 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001720 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001721 err:
1722 PyBuffer_Release(view);
1723 view->buf = NULL;
1724 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001725}
1726
1727LOCAL(PyObject*)
1728state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001729 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001730{
1731 /* prepare state object */
1732
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001733 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 int logical_charsize, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735 void* ptr;
1736
1737 memset(state, 0, sizeof(SRE_STATE));
1738
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001739 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001740 state->lastindex = -1;
1741
Benjamin Petersone48944b2012-03-07 14:50:25 -06001742 state->buffer.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001743 ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001744 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001745 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001746
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001747 if (logical_charsize == 1 && pattern->logical_charsize > 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001748 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001749 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001750 goto err;
1751 }
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001752 if (logical_charsize > 1 && pattern->logical_charsize == 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001753 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001754 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001755 goto err;
1756 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001757
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001758 /* adjust boundaries */
1759 if (start < 0)
1760 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001761 else if (start > length)
1762 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001763
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001764 if (end < 0)
1765 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001766 else if (end > length)
1767 end = length;
1768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 state->logical_charsize = logical_charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001770 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001771
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001772 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001773
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001774 state->start = (void*) ((char*) ptr + start * state->charsize);
1775 state->end = (void*) ((char*) ptr + end * state->charsize);
1776
1777 Py_INCREF(string);
1778 state->string = string;
1779 state->pos = start;
1780 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001781
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001782 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001783 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001784 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001785 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001786 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001787 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001788
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001789 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001790 err:
1791 if (state->buffer.buf)
1792 PyBuffer_Release(&state->buffer);
1793 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001794}
1795
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001796LOCAL(void)
1797state_fini(SRE_STATE* state)
1798{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001799 if (state->buffer.buf)
1800 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001801 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001802 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001803}
1804
/* Convert a pointer into the subject data (`member`) to a character
   index: the byte distance from state->beginning divided by the
   per-character width. */
#define STATE_OFFSET(state, member) \
    ( ((char*)(member) - (char*)(state)->beginning) \
      / (state)->charsize )
1808
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001809LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001810state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001811{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001812 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001813
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001814 index = (index - 1) * 2;
1815
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001816 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001817 if (empty)
1818 /* want empty string */
1819 i = j = 0;
1820 else {
1821 Py_INCREF(Py_None);
1822 return Py_None;
1823 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001824 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001825 i = STATE_OFFSET(state, state->mark[index]);
1826 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001827 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001828
Fredrik Lundh58100642000-08-09 09:14:35 +00001829 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001830}
1831
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001832static void
1833pattern_error(int status)
1834{
1835 switch (status) {
1836 case SRE_ERROR_RECURSION_LIMIT:
1837 PyErr_SetString(
1838 PyExc_RuntimeError,
1839 "maximum recursion limit exceeded"
1840 );
1841 break;
1842 case SRE_ERROR_MEMORY:
1843 PyErr_NoMemory();
1844 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001845 case SRE_ERROR_INTERRUPTED:
1846 /* An exception has already been raised, so let it fly */
1847 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001848 default:
1849 /* other error codes indicate compiler/engine bugs */
1850 PyErr_SetString(
1851 PyExc_RuntimeError,
1852 "internal error in regular expression engine"
1853 );
1854 }
1855}
1856
Guido van Rossumb700df92000-03-31 14:59:30 +00001857static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001858pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001859{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001860 if (self->weakreflist != NULL)
1861 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001862 if (self->view.buf)
1863 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001864 Py_XDECREF(self->pattern);
1865 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001866 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001867 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001868}
1869
1870static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001871pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001872{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001873 SRE_STATE state;
1874 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001875
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001876 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001877 Py_ssize_t start = 0;
1878 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001879 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001880 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001881 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001882 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001883
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001884 string = state_init(&state, self, string, start, end);
1885 if (!string)
1886 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001887
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001888 state.ptr = state.start;
1889
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001890 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 if (state.logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001893 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001894 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001895 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001896 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001897
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001898 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001899 if (PyErr_Occurred())
1900 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001901
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001902 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001903
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001904 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001905}
1906
1907static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001908pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001909{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001910 SRE_STATE state;
1911 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001912
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001913 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001914 Py_ssize_t start = 0;
1915 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001916 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001917 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001918 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001919 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001920
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001921 string = state_init(&state, self, string, start, end);
1922 if (!string)
1923 return NULL;
1924
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001925 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001928 status = sre_search(&state, PatternObject_GetCode(self));
1929 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001931 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001932
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001933 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1934
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001935 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001936
Thomas Wouters89f507f2006-12-13 04:49:30 +00001937 if (PyErr_Occurred())
1938 return NULL;
1939
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001940 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001941}
1942
1943static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001944call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001945{
1946 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001947 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001948 PyObject* func;
1949 PyObject* result;
1950
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001951 if (!args)
1952 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001953 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954 if (!name)
1955 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001956 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001958 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001959 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001960 func = PyObject_GetAttrString(mod, function);
1961 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001962 if (!func)
1963 return NULL;
1964 result = PyObject_CallObject(func, args);
1965 Py_DECREF(func);
1966 Py_DECREF(args);
1967 return result;
1968}
1969
#ifdef USE_BUILTIN_COPY
/* Replace *object with the result of copy.deepcopy(*object, memo).

   On success the old reference held in *object is released, the new one is
   stored in its place and 1 is returned.  On failure *object is left
   unchanged and 0 is returned (with the exception from call() pending). */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* replacement;

    replacement = call("copy", "deepcopy", PyTuple_Pack(2, *object, memo));
    if (replacement == NULL) {
        return 0;
    }

    Py_DECREF(*object);
    *object = replacement;

    return 1; /* success */
}
#endif
1989
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001990static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001991join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001992{
1993 /* join list elements */
1994
1995 PyObject* joiner;
1996#if PY_VERSION_HEX >= 0x01060000
1997 PyObject* function;
1998 PyObject* args;
1999#endif
2000 PyObject* result;
2001
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002002 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002003 if (!joiner)
2004 return NULL;
2005
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002006 if (PyList_GET_SIZE(list) == 0) {
2007 Py_DECREF(list);
2008 return joiner;
2009 }
2010
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002011#if PY_VERSION_HEX >= 0x01060000
2012 function = PyObject_GetAttrString(joiner, "join");
2013 if (!function) {
2014 Py_DECREF(joiner);
2015 return NULL;
2016 }
2017 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002018 if (!args) {
2019 Py_DECREF(function);
2020 Py_DECREF(joiner);
2021 return NULL;
2022 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002023 PyTuple_SET_ITEM(args, 0, list);
2024 result = PyObject_CallObject(function, args);
2025 Py_DECREF(args); /* also removes list */
2026 Py_DECREF(function);
2027#else
2028 result = call(
2029 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002030 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002031 );
2032#endif
2033 Py_DECREF(joiner);
2034
2035 return result;
2036}
2037
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002038static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002039pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002040{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002041 SRE_STATE state;
2042 PyObject* list;
2043 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002044 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002045
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002046 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002047 Py_ssize_t start = 0;
2048 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002049 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002050 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002051 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002052 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002053
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002054 string = state_init(&state, self, string, start, end);
2055 if (!string)
2056 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002057
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002058 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002059 if (!list) {
2060 state_fini(&state);
2061 return NULL;
2062 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002064 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002065
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002066 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002067
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002068 state_reset(&state);
2069
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002070 state.ptr = state.start;
2071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002073 status = sre_search(&state, PatternObject_GetCode(self));
2074 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002075 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002076 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002077
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002078 if (PyErr_Occurred())
2079 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002080
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002081 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002082 if (status == 0)
2083 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002084 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002085 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002086 }
Tim Peters3d563502006-01-21 02:47:53 +00002087
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002088 /* don't bother to build a match object */
2089 switch (self->groups) {
2090 case 0:
2091 b = STATE_OFFSET(&state, state.start);
2092 e = STATE_OFFSET(&state, state.ptr);
2093 item = PySequence_GetSlice(string, b, e);
2094 if (!item)
2095 goto error;
2096 break;
2097 case 1:
2098 item = state_getslice(&state, 1, string, 1);
2099 if (!item)
2100 goto error;
2101 break;
2102 default:
2103 item = PyTuple_New(self->groups);
2104 if (!item)
2105 goto error;
2106 for (i = 0; i < self->groups; i++) {
2107 PyObject* o = state_getslice(&state, i+1, string, 1);
2108 if (!o) {
2109 Py_DECREF(item);
2110 goto error;
2111 }
2112 PyTuple_SET_ITEM(item, i, o);
2113 }
2114 break;
2115 }
2116
2117 status = PyList_Append(list, item);
2118 Py_DECREF(item);
2119 if (status < 0)
2120 goto error;
2121
2122 if (state.ptr == state.start)
2123 state.start = (void*) ((char*) state.ptr + state.charsize);
2124 else
2125 state.start = state.ptr;
2126
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002127 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002128
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002129 state_fini(&state);
2130 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002131
2132error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002133 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002134 state_fini(&state);
2135 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002136
Guido van Rossumb700df92000-03-31 14:59:30 +00002137}
2138
#if PY_VERSION_HEX >= 0x02020000
/* pattern.finditer(...)

   Creates a scanner via pattern_scanner(), fetches its "search" attribute
   and wraps it in a callable-iterator that stops when a call returns
   None.

   Returns the new iterator, or NULL with an exception set. */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
{
    PyObject* scan;
    PyObject* bound_search;
    PyObject* result = NULL;

    scan = pattern_scanner(pattern, args, kw);
    if (scan != NULL) {
        bound_search = PyObject_GetAttrString(scan, "search");
        /* our scanner reference is no longer needed once the attribute
           lookup has been attempted */
        Py_DECREF(scan);
        if (bound_search != NULL) {
            result = PyCallIter_New(bound_search, Py_None);
            Py_DECREF(bound_search);
        }
    }

    return result;
}
#endif
2162
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002163static PyObject*
2164pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2165{
2166 SRE_STATE state;
2167 PyObject* list;
2168 PyObject* item;
2169 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002170 Py_ssize_t n;
2171 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002172 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002173
2174 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002175 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002176 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002177 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002178 &string, &maxsplit))
2179 return NULL;
2180
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002181 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002182 if (!string)
2183 return NULL;
2184
2185 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002186 if (!list) {
2187 state_fini(&state);
2188 return NULL;
2189 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002190
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002191 n = 0;
2192 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002193
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002194 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002195
2196 state_reset(&state);
2197
2198 state.ptr = state.start;
2199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 if (state.logical_charsize == 1) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002201 status = sre_search(&state, PatternObject_GetCode(self));
2202 } else {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002203 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002204 }
2205
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002206 if (PyErr_Occurred())
2207 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002208
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002209 if (status <= 0) {
2210 if (status == 0)
2211 break;
2212 pattern_error(status);
2213 goto error;
2214 }
Tim Peters3d563502006-01-21 02:47:53 +00002215
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002216 if (state.start == state.ptr) {
2217 if (last == state.end)
2218 break;
2219 /* skip one character */
2220 state.start = (void*) ((char*) state.ptr + state.charsize);
2221 continue;
2222 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002223
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002224 /* get segment before this match */
2225 item = PySequence_GetSlice(
2226 string, STATE_OFFSET(&state, last),
2227 STATE_OFFSET(&state, state.start)
2228 );
2229 if (!item)
2230 goto error;
2231 status = PyList_Append(list, item);
2232 Py_DECREF(item);
2233 if (status < 0)
2234 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002235
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002236 /* add groups (if any) */
2237 for (i = 0; i < self->groups; i++) {
2238 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002239 if (!item)
2240 goto error;
2241 status = PyList_Append(list, item);
2242 Py_DECREF(item);
2243 if (status < 0)
2244 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002245 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002246
2247 n = n + 1;
2248
2249 last = state.start = state.ptr;
2250
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002251 }
2252
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002253 /* get segment following last match (even if empty) */
2254 item = PySequence_GetSlice(
2255 string, STATE_OFFSET(&state, last), state.endpos
2256 );
2257 if (!item)
2258 goto error;
2259 status = PyList_Append(list, item);
2260 Py_DECREF(item);
2261 if (status < 0)
2262 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002263
2264 state_fini(&state);
2265 return list;
2266
2267error:
2268 Py_DECREF(list);
2269 state_fini(&state);
2270 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002271
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002272}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002273
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002274static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002275pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002276 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002277{
2278 SRE_STATE state;
2279 PyObject* list;
2280 PyObject* item;
2281 PyObject* filter;
2282 PyObject* args;
2283 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002284 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002285 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002286 Py_ssize_t n;
2287 Py_ssize_t i, b, e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 int logical_charsize, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002289 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002290 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002291
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002292 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002293 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002294 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002295 Py_INCREF(filter);
2296 filter_is_callable = 1;
2297 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002298 /* if not callable, check if it's a literal string */
2299 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002300 view.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002301 ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002303 if (ptr) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304 literal = sre_literal_template(b, ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002305 } else {
2306 PyErr_Clear();
2307 literal = 0;
2308 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002309 if (view.buf)
2310 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002311 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002312 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002313 Py_INCREF(filter);
2314 filter_is_callable = 0;
2315 } else {
2316 /* not a literal; hand it over to the template compiler */
2317 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002318 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002319 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002320 );
2321 if (!filter)
2322 return NULL;
2323 filter_is_callable = PyCallable_Check(filter);
2324 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002325 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002326
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002327 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002328 if (!string) {
2329 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002330 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002331 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002332
2333 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002334 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002335 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002336 state_fini(&state);
2337 return NULL;
2338 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002339
2340 n = i = 0;
2341
2342 while (!count || n < count) {
2343
2344 state_reset(&state);
2345
2346 state.ptr = state.start;
2347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 if (state.logical_charsize == 1) {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002349 status = sre_search(&state, PatternObject_GetCode(self));
2350 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002351 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002352 }
2353
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002354 if (PyErr_Occurred())
2355 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002356
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002357 if (status <= 0) {
2358 if (status == 0)
2359 break;
2360 pattern_error(status);
2361 goto error;
2362 }
Tim Peters3d563502006-01-21 02:47:53 +00002363
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002364 b = STATE_OFFSET(&state, state.start);
2365 e = STATE_OFFSET(&state, state.ptr);
2366
2367 if (i < b) {
2368 /* get segment before this match */
2369 item = PySequence_GetSlice(string, i, b);
2370 if (!item)
2371 goto error;
2372 status = PyList_Append(list, item);
2373 Py_DECREF(item);
2374 if (status < 0)
2375 goto error;
2376
2377 } else if (i == b && i == e && n > 0)
2378 /* ignore empty match on latest position */
2379 goto next;
2380
2381 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002382 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002383 match = pattern_new_match(self, &state, 1);
2384 if (!match)
2385 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002386 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002387 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002388 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002389 goto error;
2390 }
2391 item = PyObject_CallObject(filter, args);
2392 Py_DECREF(args);
2393 Py_DECREF(match);
2394 if (!item)
2395 goto error;
2396 } else {
2397 /* filter is literal string */
2398 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002399 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002400 }
2401
2402 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002403 if (item != Py_None) {
2404 status = PyList_Append(list, item);
2405 Py_DECREF(item);
2406 if (status < 0)
2407 goto error;
2408 }
Tim Peters3d563502006-01-21 02:47:53 +00002409
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002410 i = e;
2411 n = n + 1;
2412
2413next:
2414 /* move on */
2415 if (state.ptr == state.start)
2416 state.start = (void*) ((char*) state.ptr + state.charsize);
2417 else
2418 state.start = state.ptr;
2419
2420 }
2421
2422 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002423 if (i < state.endpos) {
2424 item = PySequence_GetSlice(string, i, state.endpos);
2425 if (!item)
2426 goto error;
2427 status = PyList_Append(list, item);
2428 Py_DECREF(item);
2429 if (status < 0)
2430 goto error;
2431 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002432
2433 state_fini(&state);
2434
Guido van Rossum4e173842001-12-07 04:25:10 +00002435 Py_DECREF(filter);
2436
Fredrik Lundhdac58492001-10-21 21:48:30 +00002437 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002438 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002439
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002440 if (!item)
2441 return NULL;
2442
2443 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002444 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002445
2446 return item;
2447
2448error:
2449 Py_DECREF(list);
2450 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002451 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002452 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002453
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002454}
2455
2456static PyObject*
2457pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2458{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002459 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002460 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002461 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002462 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002463 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002464 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002465 return NULL;
2466
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002467 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002468}
2469
2470static PyObject*
2471pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2472{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002475 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002476 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002477 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002478 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002479 return NULL;
2480
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002481 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002482}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002483
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002484static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002485pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002486{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002487#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002488 PatternObject* copy;
2489 int offset;
2490
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002491 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2492 if (!copy)
2493 return NULL;
2494
2495 offset = offsetof(PatternObject, groups);
2496
2497 Py_XINCREF(self->groupindex);
2498 Py_XINCREF(self->indexgroup);
2499 Py_XINCREF(self->pattern);
2500
2501 memcpy((char*) copy + offset, (char*) self + offset,
2502 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002503 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002504
2505 return (PyObject*) copy;
2506#else
2507 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2508 return NULL;
2509#endif
2510}
2511
2512static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002513pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002514{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002515#ifdef USE_BUILTIN_COPY
2516 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002517
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002518 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002519 if (!copy)
2520 return NULL;
2521
2522 if (!deepcopy(&copy->groupindex, memo) ||
2523 !deepcopy(&copy->indexgroup, memo) ||
2524 !deepcopy(&copy->pattern, memo)) {
2525 Py_DECREF(copy);
2526 return NULL;
2527 }
2528
2529#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002530 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2531 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002532#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002533}
2534
/* Docstrings for the pattern type and its methods.  Each PyDoc_STRVAR
   defines a string variable that is referenced from the pattern_methods
   table; the first line of each text is the call signature. */

PyDoc_STRVAR(pattern_match_doc,
"match(string[, pos[, endpos]]) -> match object or None.\n\
 Matches zero or more characters at the beginning of the string");

PyDoc_STRVAR(pattern_search_doc,
"search(string[, pos[, endpos]]) -> match object or None.\n\
 Scan through string looking for a match, and return a corresponding\n\
 match object instance. Return None if no position in the string matches.");

PyDoc_STRVAR(pattern_split_doc,
"split(string[, maxsplit = 0]) -> list.\n\
 Split string by the occurrences of pattern.");

PyDoc_STRVAR(pattern_findall_doc,
"findall(string[, pos[, endpos]]) -> list.\n\
 Return a list of all non-overlapping matches of pattern in string.");

PyDoc_STRVAR(pattern_finditer_doc,
"finditer(string[, pos[, endpos]]) -> iterator.\n\
 Return an iterator over all non-overlapping matches for the \n\
 RE pattern in string. For each match, the iterator returns a\n\
 match object.");

PyDoc_STRVAR(pattern_sub_doc,
"sub(repl, string[, count = 0]) -> newstring.\n\
 Return the string obtained by replacing the leftmost non-overlapping\n\
 occurrences of pattern in string by the replacement repl.");

PyDoc_STRVAR(pattern_subn_doc,
"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
 the leftmost non-overlapping occurrences of pattern with the\n\
 replacement repl.");

PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2570
/* Method table of the pattern type (installed as tp_methods of
   Pattern_Type).  Each entry is {name, C function, calling convention,
   docstring}; "scanner", "__copy__" and "__deepcopy__" leave the docstring
   field unset.  "finditer" is only compiled in for Python 2.2 and later. */
static PyMethodDef pattern_methods[] = {
    {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
     pattern_match_doc},
    {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
     pattern_search_doc},
    {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
     pattern_sub_doc},
    {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
     pattern_subn_doc},
    {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
     pattern_split_doc},
    {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
     pattern_findall_doc},
#if PY_VERSION_HEX >= 0x02020000
    {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
     pattern_finditer_doc},
#endif
    {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
    {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
    {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
    {NULL, NULL}  /* Sentinel */
};
2593
/* Byte offset of field x inside PatternObject. */
#define PAT_OFF(x) offsetof(PatternObject, x)

/* Read-only attributes of the pattern type, each backed directly by the
   PatternObject field of the same name (installed as tp_members of
   Pattern_Type). */
static PyMemberDef pattern_members[] = {
    {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
    {"flags", T_INT, PAT_OFF(flags), READONLY},
    {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
    {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
    {NULL} /* Sentinel */
};
Guido van Rossumb700df92000-03-31 14:59:30 +00002602
/* Type object for compiled patterns.  It is a variable-size type: each
   instance is a PatternObject followed by SRE_CODE-sized items (see the
   PyObject_NEW_VAR call in _compile).  Slots after tp_members are left to
   their implicit zero initialisation. */
static PyTypeObject Pattern_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_" SRE_MODULE ".SRE_Pattern",      /* tp_name */
    sizeof(PatternObject), sizeof(SRE_CODE), /* tp_basicsize, tp_itemsize */
    (destructor)pattern_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    0,                                  /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    pattern_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    pattern_methods,                    /* tp_methods */
    pattern_members,                    /* tp_members */
};
2633
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002634static int _validate(PatternObject *self); /* Forward */
2635
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002636static PyObject *
2637_compile(PyObject* self_, PyObject* args)
2638{
2639 /* "compile" pattern descriptor to pattern object */
2640
2641 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002642 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002643
2644 PyObject* pattern;
2645 int flags = 0;
2646 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002647 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002648 PyObject* groupindex = NULL;
2649 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002650
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002651 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002652 &PyList_Type, &code, &groups,
2653 &groupindex, &indexgroup))
2654 return NULL;
2655
2656 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002657 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002658 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2659 if (!self)
2660 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002661 self->weakreflist = NULL;
2662 self->pattern = NULL;
2663 self->groupindex = NULL;
2664 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002665 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002666
2667 self->codesize = n;
2668
2669 for (i = 0; i < n; i++) {
2670 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002671 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002672 self->code[i] = (SRE_CODE) value;
2673 if ((unsigned long) self->code[i] != value) {
2674 PyErr_SetString(PyExc_OverflowError,
2675 "regular expression code size limit exceeded");
2676 break;
2677 }
2678 }
2679
2680 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002681 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002682 return NULL;
2683 }
2684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 if (pattern == Py_None) {
2686 self->logical_charsize = -1;
2687 self->charsize = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01002688 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002689 else {
2690 Py_ssize_t p_length;
2691 if (!getstring(pattern, &p_length, &self->logical_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002692 &self->charsize, &self->view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 Py_DECREF(self);
2694 return NULL;
2695 }
2696 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002697
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002698 Py_INCREF(pattern);
2699 self->pattern = pattern;
2700
2701 self->flags = flags;
2702
2703 self->groups = groups;
2704
2705 Py_XINCREF(groupindex);
2706 self->groupindex = groupindex;
2707
2708 Py_XINCREF(indexgroup);
2709 self->indexgroup = indexgroup;
2710
2711 self->weakreflist = NULL;
2712
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002713 if (!_validate(self)) {
2714 Py_DECREF(self);
2715 return NULL;
2716 }
2717
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002718 return (PyObject*) self;
2719}
2720
Guido van Rossumb700df92000-03-31 14:59:30 +00002721/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002722/* Code validation */
2723
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
   bytes wide (the latter if Python is compiled for "wide" unicode support).
*/
2746
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesised
   printf argument list, e.g. VTRACE(("x=%d\n", x)); it expands to nothing
   unless VVERBOSE is defined. */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: trace the source line, then return 0 from the function
   in which the macro is expanded. */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: the cursor
   'code', the limit 'end', and the destination 'op', 'arg' or 'skip'.
   Each one FAILs (returns 0 from the enclosing function) when the cursor
   has already reached 'end'.

   GET_OP and GET_ARG read one code word and advance the cursor. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* GET_SKIP_ADJ reads a skip count, FAILs if skip-adj is larger than the
   distance from the skip word to 'end', and only then steps over the skip
   word.  'adj' lets a caller loosen the limit by a fixed amount. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-adj > end-code)                        \
            FAIL;                                       \
        code++;                                         \
    } while (0)
/* The common case: no adjustment */
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002787
2788static int
2789_validate_charset(SRE_CODE *code, SRE_CODE *end)
2790{
2791 /* Some variables are manipulated by the macros above */
2792 SRE_CODE op;
2793 SRE_CODE arg;
2794 SRE_CODE offset;
2795 int i;
2796
2797 while (code < end) {
2798 GET_OP;
2799 switch (op) {
2800
2801 case SRE_OP_NEGATE:
2802 break;
2803
2804 case SRE_OP_LITERAL:
2805 GET_ARG;
2806 break;
2807
2808 case SRE_OP_RANGE:
2809 GET_ARG;
2810 GET_ARG;
2811 break;
2812
2813 case SRE_OP_CHARSET:
2814 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002815 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002816 FAIL;
2817 code += offset;
2818 break;
2819
2820 case SRE_OP_BIGCHARSET:
2821 GET_ARG; /* Number of blocks */
2822 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002823 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002824 FAIL;
2825 /* Make sure that each byte points to a valid block */
2826 for (i = 0; i < 256; i++) {
2827 if (((unsigned char *)code)[i] >= arg)
2828 FAIL;
2829 }
2830 code += offset;
2831 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
Serhiy Storchaka4bb17342013-04-13 21:15:47 +03002832 if (offset > end-code)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002833 FAIL;
2834 code += offset;
2835 break;
2836
2837 case SRE_OP_CATEGORY:
2838 GET_ARG;
2839 switch (arg) {
2840 case SRE_CATEGORY_DIGIT:
2841 case SRE_CATEGORY_NOT_DIGIT:
2842 case SRE_CATEGORY_SPACE:
2843 case SRE_CATEGORY_NOT_SPACE:
2844 case SRE_CATEGORY_WORD:
2845 case SRE_CATEGORY_NOT_WORD:
2846 case SRE_CATEGORY_LINEBREAK:
2847 case SRE_CATEGORY_NOT_LINEBREAK:
2848 case SRE_CATEGORY_LOC_WORD:
2849 case SRE_CATEGORY_LOC_NOT_WORD:
2850 case SRE_CATEGORY_UNI_DIGIT:
2851 case SRE_CATEGORY_UNI_NOT_DIGIT:
2852 case SRE_CATEGORY_UNI_SPACE:
2853 case SRE_CATEGORY_UNI_NOT_SPACE:
2854 case SRE_CATEGORY_UNI_WORD:
2855 case SRE_CATEGORY_UNI_NOT_WORD:
2856 case SRE_CATEGORY_UNI_LINEBREAK:
2857 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2858 break;
2859 default:
2860 FAIL;
2861 }
2862 break;
2863
2864 default:
2865 FAIL;
2866
2867 }
2868 }
2869
2870 return 1;
2871}
2872
/* Validate the sequence of operations occupying [code, end).

   'groups' is the limit used for the MARK and GROUPREF* argument checks.
   Compound operations (BRANCH, REPEAT*, GROUPREF_EXISTS, ASSERT*) are
   checked by recursing on the sub-range described by their skip count;
   character sets are handed to _validate_charset().

   Returns 1 if the range is well formed, 0 otherwise (via FAIL). */
static int
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{
    /* Some variables are manipulated by the macros above */
    SRE_CODE op;
    SRE_CODE arg;
    SRE_CODE skip;

    VTRACE(("code=%p, end=%p\n", code, end));

    /* A recursive call may be handed an 'end' computed from a skip count
       that is too small; catch that here */
    if (code > end)
        FAIL;

    while (code < end) {
        GET_OP;
        switch (op) {

        case SRE_OP_MARK:
            /* We don't check whether marks are properly nested; the
               sre_match() code is robust even if they don't, and the worst
               you can get is nonsensical match results. */
            GET_ARG;
            if (arg > 2*groups+1) {
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
                FAIL;
            }
            break;

        case SRE_OP_LITERAL:
        case SRE_OP_NOT_LITERAL:
        case SRE_OP_LITERAL_IGNORE:
        case SRE_OP_NOT_LITERAL_IGNORE:
            GET_ARG;
            /* The arg is just a character, nothing to check */
            break;

        case SRE_OP_SUCCESS:
        case SRE_OP_FAILURE:
            /* Nothing to check; these normally end the matching process */
            break;

        case SRE_OP_AT:
            GET_ARG;
            switch (arg) {
            case SRE_AT_BEGINNING:
            case SRE_AT_BEGINNING_STRING:
            case SRE_AT_BEGINNING_LINE:
            case SRE_AT_END:
            case SRE_AT_END_LINE:
            case SRE_AT_END_STRING:
            case SRE_AT_BOUNDARY:
            case SRE_AT_NON_BOUNDARY:
            case SRE_AT_LOC_BOUNDARY:
            case SRE_AT_LOC_NON_BOUNDARY:
            case SRE_AT_UNI_BOUNDARY:
            case SRE_AT_UNI_NON_BOUNDARY:
                break;
            default:
                FAIL;
            }
            break;

        case SRE_OP_ANY:
        case SRE_OP_ANY_ALL:
            /* These have no operands */
            break;

        case SRE_OP_IN:
        case SRE_OP_IN_IGNORE:
            GET_SKIP;
            /* Stop 1 before the end; we check the FAILURE below */
            if (!_validate_charset(code, code+skip-2))
                FAIL;
            if (code[skip-2] != SRE_OP_FAILURE)
                FAIL;
            code += skip-1;
            break;

        case SRE_OP_INFO:
            {
                /* A minimal info field is
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
                   more follows. */
                SRE_CODE flags, i;
                SRE_CODE *newcode;
                GET_SKIP;
                /* newcode is where the info block ends */
                newcode = code+skip-1;
                GET_ARG; flags = arg;
                GET_ARG; /* min: read but not checked */
                GET_ARG; /* max: read but not checked */
                /* Check that only valid flags are present */
                if ((flags & ~(SRE_INFO_PREFIX |
                               SRE_INFO_LITERAL |
                               SRE_INFO_CHARSET)) != 0)
                    FAIL;
                /* PREFIX and CHARSET are mutually exclusive */
                if ((flags & SRE_INFO_PREFIX) &&
                    (flags & SRE_INFO_CHARSET))
                    FAIL;
                /* LITERAL implies PREFIX */
                if ((flags & SRE_INFO_LITERAL) &&
                    !(flags & SRE_INFO_PREFIX))
                    FAIL;
                /* Validate the prefix */
                if (flags & SRE_INFO_PREFIX) {
                    SRE_CODE prefix_len;
                    GET_ARG; prefix_len = arg;
                    GET_ARG; /* second prefix word: read but not checked */
                    /* Here comes the prefix string */
                    if (prefix_len > newcode-code)
                        FAIL;
                    code += prefix_len;
                    /* And here comes the overlap table */
                    if (prefix_len > newcode-code)
                        FAIL;
                    /* Each overlap value should be < prefix_len */
                    for (i = 0; i < prefix_len; i++) {
                        if (code[i] >= prefix_len)
                            FAIL;
                    }
                    code += prefix_len;
                }
                /* Validate the charset */
                if (flags & SRE_INFO_CHARSET) {
                    if (!_validate_charset(code, newcode-1))
                        FAIL;
                    if (newcode[-1] != SRE_OP_FAILURE)
                        FAIL;
                    code = newcode;
                }
                else if (code != newcode) {
                    /* Without a charset, the block must end exactly here */
                    VTRACE(("code=%p, newcode=%p\n", code, newcode));
                    FAIL;
                }
            }
            break;

        case SRE_OP_BRANCH:
            {
                /* Common jump target of all alternatives, once known */
                SRE_CODE *target = NULL;
                for (;;) {
                    GET_SKIP;
                    /* A zero skip terminates the list of alternatives */
                    if (skip == 0)
                        break;
                    /* Stop 2 before the end; we check the JUMP below */
                    if (!_validate_inner(code, code+skip-3, groups))
                        FAIL;
                    code += skip-3;
                    /* Check that it ends with a JUMP, and that each JUMP
                       has the same target */
                    GET_OP;
                    if (op != SRE_OP_JUMP)
                        FAIL;
                    GET_SKIP;
                    if (target == NULL)
                        target = code+skip-1;
                    else if (code+skip-1 != target)
                        FAIL;
                }
            }
            break;

        case SRE_OP_REPEAT_ONE:
        case SRE_OP_MIN_REPEAT_ONE:
            {
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                /* The repeated item, which must be followed by SUCCESS */
                if (!_validate_inner(code, code+skip-4, groups))
                    FAIL;
                code += skip-4;
                GET_OP;
                if (op != SRE_OP_SUCCESS)
                    FAIL;
            }
            break;

        case SRE_OP_REPEAT:
            {
                SRE_CODE min, max;
                GET_SKIP;
                GET_ARG; min = arg;
                GET_ARG; max = arg;
                if (min > max)
                    FAIL;
                if (max > SRE_MAXREPEAT)
                    FAIL;
                /* The repeated item, which must be followed by
                   MAX_UNTIL or MIN_UNTIL */
                if (!_validate_inner(code, code+skip-3, groups))
                    FAIL;
                code += skip-3;
                GET_OP;
                if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
                    FAIL;
            }
            break;

        case SRE_OP_GROUPREF:
        case SRE_OP_GROUPREF_IGNORE:
            GET_ARG;
            if (arg >= groups)
                FAIL;
            break;

        case SRE_OP_GROUPREF_EXISTS:
            /* The regex syntax for this is: '(?(group)then|else)', where
               'group' is either an integer group number or a group name,
               'then' and 'else' are sub-regexes, and 'else' is optional. */
            GET_ARG;
            if (arg >= groups)
                FAIL;
            GET_SKIP_ADJ(1);
            code--; /* The skip is relative to the first arg! */
            /* There are two possibilities here: if there is both a 'then'
               part and an 'else' part, the generated code looks like:

               GROUPREF_EXISTS
               <group>
               <skipyes>
               ...then part...
               JUMP
               <skipno>
               (<skipyes> jumps here)
               ...else part...
               (<skipno> jumps here)

               If there is only a 'then' part, it looks like:

               GROUPREF_EXISTS
               <group>
               <skip>
               ...then part...
               (<skip> jumps here)

               There is no direct way to decide which it is, and we don't want
               to allow arbitrary jumps anywhere in the code; so we just look
               for a JUMP opcode preceding our skip target.
            */
            if (skip >= 3 && skip-3 < end-code &&
                code[skip-3] == SRE_OP_JUMP)
            {
                VTRACE(("both then and else parts present\n"));
                if (!_validate_inner(code+1, code+skip-3, groups))
                    FAIL;
                code += skip-2; /* Position after JUMP, at <skipno> */
                GET_SKIP;
                if (!_validate_inner(code, code+skip-1, groups))
                    FAIL;
                code += skip-1;
            }
            else {
                VTRACE(("only a then part present\n"));
                if (!_validate_inner(code+1, code+skip-1, groups))
                    FAIL;
                code += skip-1;
            }
            break;

        case SRE_OP_ASSERT:
        case SRE_OP_ASSERT_NOT:
            GET_SKIP;
            GET_ARG; /* 0 for lookahead, width for lookbehind */
            code--; /* Back up over arg to simplify math below */
            if (arg & 0x80000000)
                FAIL; /* Width too large */
            /* Stop 1 before the end; we check the SUCCESS below */
            if (!_validate_inner(code+1, code+skip-2, groups))
                FAIL;
            code += skip-2;
            GET_OP;
            if (op != SRE_OP_SUCCESS)
                FAIL;
            break;

        default:
            /* Unknown opcode, or one that is not allowed here */
            FAIL;

        }
    }

    VTRACE(("okay\n"));
    return 1;
}
3162static int
3163_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3164{
3165 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3166 FAIL;
3167 if (groups == 0) /* fix for simplejson */
3168 groups = 100; /* 100 groups should always be safe */
3169 return _validate_inner(code, end-1, groups);
3170}
3171
3172static int
3173_validate(PatternObject *self)
3174{
3175 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3176 {
3177 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3178 return 0;
3179 }
3180 else
3181 VTRACE(("Success!\n"));
3182 return 1;
3183}
3184
3185/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003186/* match methods */
3187
/* Deallocator for match objects: drop the references the match holds,
   then free the object itself. */
static void
match_dealloc(MatchObject* self)
{
    /* regs and string may be NULL, hence the X variants */
    Py_XDECREF(self->regs);
    Py_XDECREF(self->string);
    Py_DECREF(self->pattern);
    PyObject_DEL(self);
}
3196
3197static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003198match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003199{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003200 if (index < 0 || index >= self->groups) {
3201 /* raise IndexError if we were given a bad group number */
3202 PyErr_SetString(
3203 PyExc_IndexError,
3204 "no such group"
3205 );
3206 return NULL;
3207 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003208
Fredrik Lundh6f013982000-07-03 18:44:21 +00003209 index *= 2;
3210
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003211 if (self->string == Py_None || self->mark[index] < 0) {
3212 /* return default value if the string or group is undefined */
3213 Py_INCREF(def);
3214 return def;
3215 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003216
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003217 return PySequence_GetSlice(
3218 self->string, self->mark[index], self->mark[index+1]
3219 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003220}
3221
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003222static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003223match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003224{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003225 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003226
Guido van Rossumddefaf32007-01-14 03:31:43 +00003227 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003228 /* Default value */
3229 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00003230
Christian Heimes217cfd12007-12-02 14:31:20 +00003231 if (PyLong_Check(index))
3232 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003233
Fredrik Lundh6f013982000-07-03 18:44:21 +00003234 i = -1;
3235
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003236 if (self->pattern->groupindex) {
3237 index = PyObject_GetItem(self->pattern->groupindex, index);
3238 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003239 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003240 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003241 Py_DECREF(index);
3242 } else
3243 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003244 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003245
3246 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003247}
3248
3249static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003250match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003251{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003252 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003253}
3254
3255static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003256match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003257{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003258 /* delegate to Python code */
3259 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003260 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003261 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003262 );
3263}
3264
3265static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003266match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003267{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003268 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003269 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003270
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003271 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003272
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003273 switch (size) {
3274 case 0:
3275 result = match_getslice(self, Py_False, Py_None);
3276 break;
3277 case 1:
3278 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3279 break;
3280 default:
3281 /* fetch multiple items */
3282 result = PyTuple_New(size);
3283 if (!result)
3284 return NULL;
3285 for (i = 0; i < size; i++) {
3286 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003287 self, PyTuple_GET_ITEM(args, i), Py_None
3288 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003289 if (!item) {
3290 Py_DECREF(result);
3291 return NULL;
3292 }
3293 PyTuple_SET_ITEM(result, i, item);
3294 }
3295 break;
3296 }
3297 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003298}
3299
3300static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003301match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003302{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003303 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003304 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003305
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003306 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003307 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003308 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003309 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003310
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003311 result = PyTuple_New(self->groups-1);
3312 if (!result)
3313 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003315 for (index = 1; index < self->groups; index++) {
3316 PyObject* item;
3317 item = match_getslice_by_index(self, index, def);
3318 if (!item) {
3319 Py_DECREF(result);
3320 return NULL;
3321 }
3322 PyTuple_SET_ITEM(result, index-1, item);
3323 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003324
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003325 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003326}
3327
3328static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003329match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003330{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003331 PyObject* result;
3332 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003333 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003335 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003336 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003337 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003338 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003339
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003340 result = PyDict_New();
3341 if (!result || !self->pattern->groupindex)
3342 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003344 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003345 if (!keys)
3346 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003347
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003348 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003349 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003350 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003351 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003352 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003353 if (!key)
3354 goto failed;
3355 value = match_getslice(self, key, def);
3356 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003357 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003358 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003359 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003360 status = PyDict_SetItem(result, key, value);
3361 Py_DECREF(value);
3362 if (status < 0)
3363 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003364 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003365
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003366 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003367
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003368 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003369
3370failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003371 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003372 Py_DECREF(result);
3373 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003374}
3375
3376static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003377match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003378{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003379 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003380
Guido van Rossumddefaf32007-01-14 03:31:43 +00003381 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003382 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003383 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003384
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003385 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003386
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003387 if (index < 0 || index >= self->groups) {
3388 PyErr_SetString(
3389 PyExc_IndexError,
3390 "no such group"
3391 );
3392 return NULL;
3393 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003394
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003395 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003396 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003397}
3398
3399static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003400match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003401{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003402 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003403
Guido van Rossumddefaf32007-01-14 03:31:43 +00003404 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003405 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003406 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003407
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003408 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003409
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003410 if (index < 0 || index >= self->groups) {
3411 PyErr_SetString(
3412 PyExc_IndexError,
3413 "no such group"
3414 );
3415 return NULL;
3416 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003417
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003418 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003419 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003420}
3421
3422LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003423_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003424{
3425 PyObject* pair;
3426 PyObject* item;
3427
3428 pair = PyTuple_New(2);
3429 if (!pair)
3430 return NULL;
3431
Christian Heimes217cfd12007-12-02 14:31:20 +00003432 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003433 if (!item)
3434 goto error;
3435 PyTuple_SET_ITEM(pair, 0, item);
3436
Christian Heimes217cfd12007-12-02 14:31:20 +00003437 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003438 if (!item)
3439 goto error;
3440 PyTuple_SET_ITEM(pair, 1, item);
3441
3442 return pair;
3443
3444 error:
3445 Py_DECREF(pair);
3446 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003447}
3448
3449static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003450match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003451{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003452 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003453
Guido van Rossumddefaf32007-01-14 03:31:43 +00003454 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003455 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003456 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003457
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003458 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003459
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003460 if (index < 0 || index >= self->groups) {
3461 PyErr_SetString(
3462 PyExc_IndexError,
3463 "no such group"
3464 );
3465 return NULL;
3466 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003467
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003468 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003469 return _pair(self->mark[index*2], self->mark[index*2+1]);
3470}
3471
3472static PyObject*
3473match_regs(MatchObject* self)
3474{
3475 PyObject* regs;
3476 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003477 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003478
3479 regs = PyTuple_New(self->groups);
3480 if (!regs)
3481 return NULL;
3482
3483 for (index = 0; index < self->groups; index++) {
3484 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3485 if (!item) {
3486 Py_DECREF(regs);
3487 return NULL;
3488 }
3489 PyTuple_SET_ITEM(regs, index, item);
3490 }
3491
3492 Py_INCREF(regs);
3493 self->regs = regs;
3494
3495 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003496}
3497
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003498static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003499match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003500{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003501#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003502 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003503 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003504
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003505 slots = 2 * (self->pattern->groups+1);
3506
3507 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3508 if (!copy)
3509 return NULL;
3510
3511 /* this value a constant, but any compiler should be able to
3512 figure that out all by itself */
3513 offset = offsetof(MatchObject, string);
3514
3515 Py_XINCREF(self->pattern);
3516 Py_XINCREF(self->string);
3517 Py_XINCREF(self->regs);
3518
3519 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003520 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003521
3522 return (PyObject*) copy;
3523#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003524 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003525 return NULL;
3526#endif
3527}
3528
3529static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003530match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003531{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003532#ifdef USE_BUILTIN_COPY
3533 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003534
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003535 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003536 if (!copy)
3537 return NULL;
3538
3539 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3540 !deepcopy(&copy->string, memo) ||
3541 !deepcopy(&copy->regs, memo)) {
3542 Py_DECREF(copy);
3543 return NULL;
3544 }
3545
3546#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003547 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3548 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003549#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003550}
3551
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003552PyDoc_STRVAR(match_doc,
3553"The result of re.match() and re.search().\n\
3554Match objects always have a boolean value of True.");
3555
3556PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003557"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003558 Return subgroup(s) of the match by indices or names.\n\
3559 For 0 returns the entire match.");
3560
3561PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003562"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003563 Return index of the start of the substring matched by group.");
3564
3565PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003566"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003567 Return index of the end of the substring matched by group.");
3568
3569PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003570"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003571 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3572
3573PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003574"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003575 Return a tuple containing all the subgroups of the match, from 1.\n\
3576 The default argument is used for groups\n\
3577 that did not participate in the match");
3578
3579PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003580"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003581 Return a dictionary containing all the named subgroups of the match,\n\
3582 keyed by the subgroup name. The default argument is used for groups\n\
3583 that did not participate in the match");
3584
3585PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003586"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003587 Return the string obtained by doing backslash substitution\n\
3588 on the string template, as done by the sub() method.");
3589
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003590static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003591 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3592 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3593 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3594 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3595 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3596 match_groups_doc},
3597 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3598 match_groupdict_doc},
3599 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003600 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3601 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003602 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003603};
3604
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003605static PyObject *
3606match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003607{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003608 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003609 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003610 Py_INCREF(Py_None);
3611 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003612}
3613
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003614static PyObject *
3615match_lastgroup_get(MatchObject *self)
3616{
3617 if (self->pattern->indexgroup && self->lastindex >= 0) {
3618 PyObject* result = PySequence_GetItem(
3619 self->pattern->indexgroup, self->lastindex
3620 );
3621 if (result)
3622 return result;
3623 PyErr_Clear();
3624 }
3625 Py_INCREF(Py_None);
3626 return Py_None;
3627}
3628
3629static PyObject *
3630match_regs_get(MatchObject *self)
3631{
3632 if (self->regs) {
3633 Py_INCREF(self->regs);
3634 return self->regs;
3635 } else
3636 return match_regs(self);
3637}
3638
3639static PyGetSetDef match_getset[] = {
3640 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3641 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3642 {"regs", (getter)match_regs_get, (setter)NULL},
3643 {NULL}
3644};
3645
3646#define MATCH_OFF(x) offsetof(MatchObject, x)
3647static PyMemberDef match_members[] = {
3648 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3649 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3650 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3651 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3652 {NULL}
3653};
3654
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3657
Neal Norwitz57c179c2006-03-22 07:18:02 +00003658static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003659 PyVarObject_HEAD_INIT(NULL,0)
3660 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003661 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003662 (destructor)match_dealloc, /* tp_dealloc */
3663 0, /* tp_print */
3664 0, /* tp_getattr */
3665 0, /* tp_setattr */
3666 0, /* tp_reserved */
3667 0, /* tp_repr */
3668 0, /* tp_as_number */
3669 0, /* tp_as_sequence */
3670 0, /* tp_as_mapping */
3671 0, /* tp_hash */
3672 0, /* tp_call */
3673 0, /* tp_str */
3674 0, /* tp_getattro */
3675 0, /* tp_setattro */
3676 0, /* tp_as_buffer */
3677 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003678 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003679 0, /* tp_traverse */
3680 0, /* tp_clear */
3681 0, /* tp_richcompare */
3682 0, /* tp_weaklistoffset */
3683 0, /* tp_iter */
3684 0, /* tp_iternext */
3685 match_methods, /* tp_methods */
3686 match_members, /* tp_members */
3687 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003688};
3689
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003690static PyObject*
3691pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3692{
3693 /* create match object (from state object) */
3694
3695 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003696 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003697 char* base;
3698 int n;
3699
3700 if (status > 0) {
3701
3702 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003703 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003704 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3705 2*(pattern->groups+1));
3706 if (!match)
3707 return NULL;
3708
3709 Py_INCREF(pattern);
3710 match->pattern = pattern;
3711
3712 Py_INCREF(state->string);
3713 match->string = state->string;
3714
3715 match->regs = NULL;
3716 match->groups = pattern->groups+1;
3717
3718 /* fill in group slices */
3719
3720 base = (char*) state->beginning;
3721 n = state->charsize;
3722
3723 match->mark[0] = ((char*) state->start - base) / n;
3724 match->mark[1] = ((char*) state->ptr - base) / n;
3725
3726 for (i = j = 0; i < pattern->groups; i++, j+=2)
3727 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3728 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3729 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3730 } else
3731 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3732
3733 match->pos = state->pos;
3734 match->endpos = state->endpos;
3735
3736 match->lastindex = state->lastindex;
3737
3738 return (PyObject*) match;
3739
3740 } else if (status == 0) {
3741
3742 /* no match */
3743 Py_INCREF(Py_None);
3744 return Py_None;
3745
3746 }
3747
3748 /* internal error */
3749 pattern_error(status);
3750 return NULL;
3751}
3752
3753
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003754/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003755/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003756
3757static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003758scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003759{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003760 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003761 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003762 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003763}
3764
3765static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003766scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003767{
3768 SRE_STATE* state = &self->state;
3769 PyObject* match;
3770 int status;
3771
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003772 state_reset(state);
3773
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003774 state->ptr = state->start;
3775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 if (state->logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003777 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003778 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003779 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003780 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003781 if (PyErr_Occurred())
3782 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003783
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003784 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003785 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003786
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003787 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003788 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003789 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003790 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003791
3792 return match;
3793}
3794
3795
3796static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003797scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003798{
3799 SRE_STATE* state = &self->state;
3800 PyObject* match;
3801 int status;
3802
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003803 state_reset(state);
3804
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003805 state->ptr = state->start;
3806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 if (state->logical_charsize == 1) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003808 status = sre_search(state, PatternObject_GetCode(self->pattern));
3809 } else {
3810 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3811 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003812 if (PyErr_Occurred())
3813 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003814
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003815 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003816 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003817
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003818 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003819 state->start = (void*) ((char*) state->ptr + state->charsize);
3820 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003821 state->start = state->ptr;
3822
3823 return match;
3824}
3825
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003826static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003827 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3828 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003829 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003830};
3831
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003832#define SCAN_OFF(x) offsetof(ScannerObject, x)
3833static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03003834 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003835 {NULL} /* Sentinel */
3836};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003837
Neal Norwitz57c179c2006-03-22 07:18:02 +00003838static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003839 PyVarObject_HEAD_INIT(NULL, 0)
3840 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003841 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003842 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003843 0, /* tp_print */
3844 0, /* tp_getattr */
3845 0, /* tp_setattr */
3846 0, /* tp_reserved */
3847 0, /* tp_repr */
3848 0, /* tp_as_number */
3849 0, /* tp_as_sequence */
3850 0, /* tp_as_mapping */
3851 0, /* tp_hash */
3852 0, /* tp_call */
3853 0, /* tp_str */
3854 0, /* tp_getattro */
3855 0, /* tp_setattro */
3856 0, /* tp_as_buffer */
3857 Py_TPFLAGS_DEFAULT, /* tp_flags */
3858 0, /* tp_doc */
3859 0, /* tp_traverse */
3860 0, /* tp_clear */
3861 0, /* tp_richcompare */
3862 0, /* tp_weaklistoffset */
3863 0, /* tp_iter */
3864 0, /* tp_iternext */
3865 scanner_methods, /* tp_methods */
3866 scanner_members, /* tp_members */
3867 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003868};
3869
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003870static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003871pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003872{
3873 /* create search state object */
3874
3875 ScannerObject* self;
3876
3877 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003878 Py_ssize_t start = 0;
3879 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003880 static char* kwlist[] = { "source", "pos", "endpos", NULL };
3881 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
3882 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003883 return NULL;
3884
3885 /* create scanner object */
3886 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3887 if (!self)
3888 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003889 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003890
3891 string = state_init(&self->state, pattern, string, start, end);
3892 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003893 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003894 return NULL;
3895 }
3896
3897 Py_INCREF(pattern);
3898 self->pattern = (PyObject*) pattern;
3899
3900 return (PyObject*) self;
3901}
3902
Guido van Rossumb700df92000-03-31 14:59:30 +00003903static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003904 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003905 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003906 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003907 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003908};
3909
Martin v. Löwis1a214512008-06-11 05:26:20 +00003910static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003911 PyModuleDef_HEAD_INIT,
3912 "_" SRE_MODULE,
3913 NULL,
3914 -1,
3915 _functions,
3916 NULL,
3917 NULL,
3918 NULL,
3919 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00003920};
3921
3922PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003923{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003924 PyObject* m;
3925 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003926 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003927
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003928 /* Patch object types */
3929 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3930 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003931 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003932
Martin v. Löwis1a214512008-06-11 05:26:20 +00003933 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003934 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003935 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003936 d = PyModule_GetDict(m);
3937
Christian Heimes217cfd12007-12-02 14:31:20 +00003938 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003939 if (x) {
3940 PyDict_SetItemString(d, "MAGIC", x);
3941 Py_DECREF(x);
3942 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003943
Christian Heimes217cfd12007-12-02 14:31:20 +00003944 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003945 if (x) {
3946 PyDict_SetItemString(d, "CODESIZE", x);
3947 Py_DECREF(x);
3948 }
3949
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02003950 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3951 if (x) {
3952 PyDict_SetItemString(d, "MAXREPEAT", x);
3953 Py_DECREF(x);
3954 }
3955
Neal Norwitzfe537132007-08-26 03:55:15 +00003956 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003957 if (x) {
3958 PyDict_SetItemString(d, "copyright", x);
3959 Py_DECREF(x);
3960 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003961 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003962}
3963
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003964#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003965
3966/* vim:ts=4:sw=4:et
3967*/