blob: aa56529f90f6424e9af3124d4db2920d34fbcccc [file] [log] [blame]
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001/*
Guido van Rossumb700df92000-03-31 14:59:30 +00002 * Secret Labs' Regular Expression Engine
Guido van Rossumb700df92000-03-31 14:59:30 +00003 *
Fredrik Lundh6c68dc72000-06-29 10:34:56 +00004 * regular expression matching engine
Guido van Rossumb700df92000-03-31 14:59:30 +00005 *
6 * partial history:
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00007 * 1999-10-24 fl created (based on existing template matcher code)
Fredrik Lundhebc37b22000-10-28 19:30:41 +00008 * 2000-03-06 fl first alpha, sort of
Fredrik Lundhebc37b22000-10-28 19:30:41 +00009 * 2000-08-01 fl fixes for 1.6b1
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000010 * 2000-08-07 fl use PyOS_CheckStack() if available
Fredrik Lundh5644b7f2000-09-21 17:03:25 +000011 * 2000-09-20 fl added expand method
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +000012 * 2001-03-20 fl lots of fixes for 2.1b2
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000013 * 2001-04-15 fl export copyright as Python attribute, not global
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000014 * 2001-04-28 fl added __copy__ methods (work in progress)
Fredrik Lundh09705f02002-11-22 12:46:35 +000015 * 2001-05-14 fl fixes for 1.5.2 compatibility
Fredrik Lundhf71ae462001-07-02 17:04:48 +000016 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
Fredrik Lundh397a6542001-10-18 19:30:16 +000017 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
Fredrik Lundh971e78b2001-10-20 17:48:46 +000018 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
Fredrik Lundhbec95b92001-10-21 16:47:57 +000019 * 2001-10-21 fl added sub/subn primitive
Fredrik Lundh703ce812001-10-24 22:16:30 +000020 * 2001-10-24 fl added finditer primitive (for 2.2 only)
Fredrik Lundh82b23072001-12-09 16:13:15 +000021 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
Fredrik Lundh09705f02002-11-22 12:46:35 +000022 * 2002-11-09 fl fixed empty sub/subn return type
Martin v. Löwis78e2f062003-04-19 12:56:08 +000023 * 2003-04-18 mvl fully support 4-byte codes
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +000024 * 2003-10-17 gn implemented non recursive scheme
Guido van Rossumb700df92000-03-31 14:59:30 +000025 *
Fredrik Lundh770617b2001-01-14 15:06:11 +000026 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossumb700df92000-03-31 14:59:30 +000027 *
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000028 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
Guido van Rossumb700df92000-03-31 14:59:30 +000032 * Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000033 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossumb700df92000-03-31 14:59:30 +000034 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
Fredrik Lundh9c7eab82001-04-15 19:00:58 +000039static char copyright[] =
Fredrik Lundh09705f02002-11-22 12:46:35 +000040 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
Guido van Rossumb700df92000-03-31 14:59:30 +000041
Thomas Wouters0e3f5912006-08-11 14:57:12 +000042#define PY_SSIZE_T_CLEAN
43
Guido van Rossumb700df92000-03-31 14:59:30 +000044#include "Python.h"
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000045#include "structmember.h" /* offsetof */
Guido van Rossumb700df92000-03-31 14:59:30 +000046
47#include "sre.h"
48
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000049#include <ctype.h>
Guido van Rossumb700df92000-03-31 14:59:30 +000050
Fredrik Lundh436c3d582000-06-29 08:58:44 +000051/* name of this module, minus the leading underscore */
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000052#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
Fredrik Lundh436c3d582000-06-29 08:58:44 +000055
Thomas Wouters9ada3d62006-04-21 09:47:09 +000056#define SRE_PY_MODULE "re"
57
Guido van Rossumb700df92000-03-31 14:59:30 +000058/* defining this one enables tracing */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000059#undef VERBOSE
Guido van Rossumb700df92000-03-31 14:59:30 +000060
Fredrik Lundh22d25462000-07-01 17:50:59 +000061/* defining this enables unicode support (default under 1.6a1 and later) */
Fredrik Lundh436c3d582000-06-29 08:58:44 +000062#define HAVE_UNICODE
Fredrik Lundh436c3d582000-06-29 08:58:44 +000063
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000064/* -------------------------------------------------------------------- */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000065/* optional features */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000066
67/* enables fast searching */
Fredrik Lundh29c08be2000-06-29 23:33:12 +000068#define USE_FAST_SEARCH
69
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +000070/* enables copy/deepcopy handling (work in progress) */
71#undef USE_BUILTIN_COPY
72
Fredrik Lundh1c5aa692001-01-16 07:37:30 +000073#if PY_VERSION_HEX < 0x01060000
74#define PyObject_DEL(op) PyMem_DEL((op))
75#endif
76
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000077/* -------------------------------------------------------------------- */
78
/* LOCAL(type) introduces a file-private helper function returning
   <type>, using the cheapest call form the compiler is known to offer */
#if defined(_MSC_VER)
/* MSVC: ask for aggressive optimization (doesn't seem to make much
   difference) and silence the "function not inlined" warning */
#pragma optimize("agtw", on)
#pragma warning(disable: 4710)
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
89
/* error codes (all negative) */
#define SRE_ERROR_ILLEGAL           -1  /* illegal opcode */
#define SRE_ERROR_STATE             -2  /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT   -3  /* runaway recursion */
#define SRE_ERROR_MEMORY            -9  /* out of memory */
#define SRE_ERROR_INTERRUPTED       -10 /* signal handler raised exception */
Guido van Rossumb700df92000-03-31 14:59:30 +000096
/* TRACE((fmt, ...)) forwards its parenthesised argument list to printf
   when VERBOSE is defined; otherwise it expands to nothing (note the
   double parentheses at the call site) */
#if !defined(VERBOSE)
#define TRACE(v)
#else
#define TRACE(v) printf v
#endif
102
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000103/* -------------------------------------------------------------------- */
104/* search engine state */
Guido van Rossumb700df92000-03-31 14:59:30 +0000105
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000106/* default character predicates (run sre_chars.py to regenerate tables) */
107
/* flag bits stored per character in sre_char_info */
#define SRE_DIGIT_MASK      0x01
#define SRE_SPACE_MASK      0x02
#define SRE_LINEBREAK_MASK  0x04
#define SRE_ALNUM_MASK      0x08
#define SRE_WORD_MASK       0x10
113
Fredrik Lundh21009b92001-09-18 18:47:09 +0000114/* FIXME: this assumes ASCII. create tables in init_sre() instead */
115
/* per-character flag bits for code points 0..127 (16 entries per row);
   each entry is an OR of the SRE_*_MASK values */
static char sre_char_info[128] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  6,  2,  2,  2,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x50 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0, 16,
    /* 0x60 */  0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    /* 0x70 */ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,  0,  0,  0,  0,  0
};
123
/* lowercase mapping for code points 0..127 (16 entries per row):
   'A'..'Z' map to 'a'..'z', every other entry maps to itself */
static char sre_char_lower[128] = {
    /* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,
                 8,   9,  10,  11,  12,  13,  14,  15,
    /* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,
                24,  25,  26,  27,  28,  29,  30,  31,
    /* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,
                40,  41,  42,  43,  44,  45,  46,  47,
    /* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,
                56,  57,  58,  59,  60,  61,  62,  63,
    /* 0x40 */  64,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x50 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122,  91,  92,  93,  94,  95,
    /* 0x60 */  96,  97,  98,  99, 100, 101, 102, 103,
               104, 105, 106, 107, 108, 109, 110, 111,
    /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119,
               120, 121, 122, 123, 124, 125, 126, 127
};
133
/* table-driven predicates: look up the flag bits for <ch> in
   sre_char_info; anything >= 128 belongs to no class.  the result is
   the raw mask bit (non-zero) or 0.  <ch> may be evaluated twice. */
#define SRE_CHAR_INFO_TEST(ch, mask)\
    ((ch) < 128 ? (sre_char_info[(ch)] & (mask)) : 0)

#define SRE_IS_DIGIT(ch)     SRE_CHAR_INFO_TEST((ch), SRE_DIGIT_MASK)
#define SRE_IS_SPACE(ch)     SRE_CHAR_INFO_TEST((ch), SRE_SPACE_MASK)
#define SRE_IS_LINEBREAK(ch) SRE_CHAR_INFO_TEST((ch), SRE_LINEBREAK_MASK)
#define SRE_IS_ALNUM(ch)     SRE_CHAR_INFO_TEST((ch), SRE_ALNUM_MASK)
#define SRE_IS_WORD(ch)      SRE_CHAR_INFO_TEST((ch), SRE_WORD_MASK)
Guido van Rossumb700df92000-03-31 14:59:30 +0000144
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000145static unsigned int sre_lower(unsigned int ch)
146{
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000147 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000148}
149
/* locale-specific character predicates */
/* only values 0..255 are handed to the <ctype.h> classifiers; larger
 * values never match.  the range test is written !(c & ~255) instead of
 * (c < 256): the two agree for any unsigned c, and the mask form avoids
 * warnings when c's type supports only numbers < 256 */
#define SRE_LOC_IN_RANGE(ch) (!((ch) & ~255))

#define SRE_LOC_IS_DIGIT(ch) (SRE_LOC_IN_RANGE(ch) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (SRE_LOC_IN_RANGE(ch) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (SRE_LOC_IN_RANGE(ch) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
158
static unsigned int sre_lower_locale(unsigned int ch)
{
    /* lowercase via the C library's tolower() for values below 256;
       anything else is returned unchanged */
    if (ch < 256)
        return (unsigned int)tolower((ch));
    return ch;
}
163
/* unicode-specific character predicates: thin wrappers around the
   Py_UNICODE_* classification macros; "word" means alphanumeric or
   underscore */

#define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch)      (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000171
static unsigned int sre_lower_unicode(unsigned int ch)
{
    /* lowercase via Py_UNICODE_TOLOWER, for any code point */
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
176
Guido van Rossumb700df92000-03-31 14:59:30 +0000177LOCAL(int)
178sre_category(SRE_CODE category, unsigned int ch)
179{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000180 switch (category) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000181
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000182 case SRE_CATEGORY_DIGIT:
183 return SRE_IS_DIGIT(ch);
184 case SRE_CATEGORY_NOT_DIGIT:
185 return !SRE_IS_DIGIT(ch);
186 case SRE_CATEGORY_SPACE:
187 return SRE_IS_SPACE(ch);
188 case SRE_CATEGORY_NOT_SPACE:
189 return !SRE_IS_SPACE(ch);
190 case SRE_CATEGORY_WORD:
191 return SRE_IS_WORD(ch);
192 case SRE_CATEGORY_NOT_WORD:
193 return !SRE_IS_WORD(ch);
194 case SRE_CATEGORY_LINEBREAK:
195 return SRE_IS_LINEBREAK(ch);
196 case SRE_CATEGORY_NOT_LINEBREAK:
197 return !SRE_IS_LINEBREAK(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000198
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000199 case SRE_CATEGORY_LOC_WORD:
200 return SRE_LOC_IS_WORD(ch);
201 case SRE_CATEGORY_LOC_NOT_WORD:
202 return !SRE_LOC_IS_WORD(ch);
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000203
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000204 case SRE_CATEGORY_UNI_DIGIT:
205 return SRE_UNI_IS_DIGIT(ch);
206 case SRE_CATEGORY_UNI_NOT_DIGIT:
207 return !SRE_UNI_IS_DIGIT(ch);
208 case SRE_CATEGORY_UNI_SPACE:
209 return SRE_UNI_IS_SPACE(ch);
210 case SRE_CATEGORY_UNI_NOT_SPACE:
211 return !SRE_UNI_IS_SPACE(ch);
212 case SRE_CATEGORY_UNI_WORD:
213 return SRE_UNI_IS_WORD(ch);
214 case SRE_CATEGORY_UNI_NOT_WORD:
215 return !SRE_UNI_IS_WORD(ch);
216 case SRE_CATEGORY_UNI_LINEBREAK:
217 return SRE_UNI_IS_LINEBREAK(ch);
218 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
219 return !SRE_UNI_IS_LINEBREAK(ch);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000220 }
221 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000222}
223
224/* helpers */
225
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000226static void
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227data_stack_dealloc(SRE_STATE* state)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000228{
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 if (state->data_stack) {
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230 PyMem_FREE(state->data_stack);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000231 state->data_stack = NULL;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000232 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000233 state->data_stack_size = state->data_stack_base = 0;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000234}
235
236static int
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000237data_stack_grow(SRE_STATE* state, Py_ssize_t size)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000238{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000239 Py_ssize_t minsize, cursize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000240 minsize = state->data_stack_base+size;
241 cursize = state->data_stack_size;
242 if (cursize < minsize) {
243 void* stack;
244 cursize = minsize+minsize/4+1024;
245 TRACE(("allocate/grow stack %d\n", cursize));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000246 stack = PyMem_REALLOC(state->data_stack, cursize);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000247 if (!stack) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000248 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000249 return SRE_ERROR_MEMORY;
250 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251 state->data_stack = (char *)stack;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000252 state->data_stack_size = cursize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000253 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000254 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000255}
256
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000257/* generate 8-bit version */
Guido van Rossumb700df92000-03-31 14:59:30 +0000258
259#define SRE_CHAR unsigned char
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200260#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index]
Guido van Rossumb700df92000-03-31 14:59:30 +0000261#define SRE_AT sre_at
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000262#define SRE_COUNT sre_count
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000263#define SRE_CHARSET sre_charset
264#define SRE_INFO sre_info
Guido van Rossumb700df92000-03-31 14:59:30 +0000265#define SRE_MATCH sre_match
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000266#define SRE_MATCH_CONTEXT sre_match_context
Guido van Rossumb700df92000-03-31 14:59:30 +0000267#define SRE_SEARCH sre_search
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000268
Guido van Rossumb700df92000-03-31 14:59:30 +0000269#define SRE_RECURSIVE
Guido van Rossumb700df92000-03-31 14:59:30 +0000270#include "_sre.c"
Guido van Rossumb700df92000-03-31 14:59:30 +0000271#undef SRE_RECURSIVE
Fredrik Lundh436c3d582000-06-29 08:58:44 +0000272
Guido van Rossumb700df92000-03-31 14:59:30 +0000273#undef SRE_SEARCH
274#undef SRE_MATCH
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000275#undef SRE_MATCH_CONTEXT
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000276#undef SRE_INFO
277#undef SRE_CHARSET
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000278#undef SRE_COUNT
Guido van Rossumb700df92000-03-31 14:59:30 +0000279#undef SRE_AT
280#undef SRE_CHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281#undef SRE_CHARGET
Guido van Rossumb700df92000-03-31 14:59:30 +0000282
/* generate 8/16/32-bit unicode version */

/* bind the generic engine names to their unicode variants.  the
   character width is not fixed here: SRE_CHARGET picks the element type
   from state->charsize (1, 2, otherwise 4 bytes) on every access */
#define SRE_CHAR void
#define SRE_CHARGET(state, buf, index) \
    ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \
     (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \
     ((Py_UCS4*)buf)[index])
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch
297
298#endif /* SRE_RECURSIVE */
299
300/* -------------------------------------------------------------------- */
301/* String matching engine */
302
303/* the following section is compiled twice, with different character
304 settings */
305
306LOCAL(int)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200307SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at)
Guido van Rossumb700df92000-03-31 14:59:30 +0000308{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000309 /* check if pointer is at given position */
Guido van Rossumb700df92000-03-31 14:59:30 +0000310
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000311 Py_ssize_t thisp, thatp;
Guido van Rossumb700df92000-03-31 14:59:30 +0000312
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000313 switch (at) {
Fredrik Lundh80946112000-06-29 18:03:25 +0000314
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000315 case SRE_AT_BEGINNING:
Fredrik Lundh770617b2001-01-14 15:06:11 +0000316 case SRE_AT_BEGINNING_STRING:
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000317 return ((void*) ptr == state->beginning);
Fredrik Lundh80946112000-06-29 18:03:25 +0000318
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000319 case SRE_AT_BEGINNING_LINE:
320 return ((void*) ptr == state->beginning ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000322
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000323 case SRE_AT_END:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200324 return (((void*) (ptr+state->charsize) == state->end &&
325 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) ||
Fredrik Lundhef34bd22000-06-30 21:40:20 +0000326 ((void*) ptr == state->end));
Fredrik Lundh80946112000-06-29 18:03:25 +0000327
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000328 case SRE_AT_END_LINE:
329 return ((void*) ptr == state->end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0)));
Fredrik Lundh80946112000-06-29 18:03:25 +0000331
Fredrik Lundh770617b2001-01-14 15:06:11 +0000332 case SRE_AT_END_STRING:
333 return ((void*) ptr == state->end);
334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000335 case SRE_AT_BOUNDARY:
336 if (state->beginning == state->end)
337 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000338 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200339 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000340 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200341 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000342 return thisp != thatp;
Fredrik Lundh80946112000-06-29 18:03:25 +0000343
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000344 case SRE_AT_NON_BOUNDARY:
345 if (state->beginning == state->end)
346 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200348 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000349 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000352
353 case SRE_AT_LOC_BOUNDARY:
354 if (state->beginning == state->end)
355 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000361
362 case SRE_AT_LOC_NON_BOUNDARY:
363 if (state->beginning == state->end)
364 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000365 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200368 SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 return thisp == thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000370
371 case SRE_AT_UNI_BOUNDARY:
372 if (state->beginning == state->end)
373 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200375 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000378 return thisp != thatp;
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000379
380 case SRE_AT_UNI_NON_BOUNDARY:
381 if (state->beginning == state->end)
382 return 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 thatp = ((void*) ptr > state->beginning) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 thisp = ((void*) ptr < state->end) ?
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200386 SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387 return thisp == thatp;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +0000388
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000389 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000390
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000391 return 0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000392}
393
394LOCAL(int)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000395SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
Guido van Rossumb700df92000-03-31 14:59:30 +0000396{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000397 /* check if character is a member of the given set */
Guido van Rossumb700df92000-03-31 14:59:30 +0000398
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000399 int ok = 1;
Guido van Rossumb700df92000-03-31 14:59:30 +0000400
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000401 for (;;) {
402 switch (*set++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000403
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000404 case SRE_OP_FAILURE:
405 return !ok;
406
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000407 case SRE_OP_LITERAL:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000408 /* <LITERAL> <code> */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000409 if (ch == set[0])
410 return ok;
411 set++;
412 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000413
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000414 case SRE_OP_CATEGORY:
415 /* <CATEGORY> <code> */
416 if (sre_category(set[0], (int) ch))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000417 return ok;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000418 set += 1;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000419 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000420
Fredrik Lundh3562f112000-07-02 12:00:07 +0000421 case SRE_OP_CHARSET:
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000422 if (sizeof(SRE_CODE) == 2) {
423 /* <CHARSET> <bitmap> (16 bits per code word) */
424 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
425 return ok;
426 set += 16;
Tim Peters3d563502006-01-21 02:47:53 +0000427 }
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000428 else {
429 /* <CHARSET> <bitmap> (32 bits per code word) */
Gregory P. Smith90555d02012-12-10 17:44:44 -0800430 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000431 return ok;
432 set += 8;
433 }
Fredrik Lundh3562f112000-07-02 12:00:07 +0000434 break;
435
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000436 case SRE_OP_RANGE:
437 /* <RANGE> <lower> <upper> */
438 if (set[0] <= ch && ch <= set[1])
439 return ok;
440 set += 2;
441 break;
442
443 case SRE_OP_NEGATE:
444 ok = !ok;
445 break;
446
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000447 case SRE_OP_BIGCHARSET:
448 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
449 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000450 Py_ssize_t count, block;
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000451 count = *(set++);
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000452
453 if (sizeof(SRE_CODE) == 2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000455 set += 128;
456 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
457 return ok;
458 set += count*16;
459 }
460 else {
Gustavo Niemeyer601b9632004-02-14 00:31:13 +0000461 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
462 * warnings when c's type supports only numbers < N+1 */
463 if (!(ch & ~65535))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200464 block = ((char*)set)[ch >> 8];
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000465 else
466 block = -1;
467 set += 64;
Tim Peters3d563502006-01-21 02:47:53 +0000468 if (block >=0 &&
Gregory P. Smith90555d02012-12-10 17:44:44 -0800469 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
Martin v. Löwis78e2f062003-04-19 12:56:08 +0000470 return ok;
471 set += count*8;
472 }
Fredrik Lundhf71ae462001-07-02 17:04:48 +0000473 break;
474 }
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000475
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000476 default:
477 /* internal error -- there's not much we can do about it
Fredrik Lundh80946112000-06-29 18:03:25 +0000478 here, so let's just pretend it didn't match... */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000479 return 0;
480 }
481 }
Guido van Rossumb700df92000-03-31 14:59:30 +0000482}
483
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000484LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000485
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000486LOCAL(Py_ssize_t)
487SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000488{
489 SRE_CODE chr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200490 char* ptr = (char *)state->ptr;
491 char* end = (char *)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000492 Py_ssize_t i;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000493
494 /* adjust end */
Serhiy Storchakac1b59d42012-12-29 23:38:48 +0200495 if (maxcount < (end - ptr) / state->charsize && maxcount != 65535)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200496 end = ptr + maxcount*state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000497
498 switch (pattern[0]) {
499
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000500 case SRE_OP_IN:
501 /* repeated set */
502 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
Victor Stinner63ab8752011-11-22 03:31:20 +0100503 while (ptr < end &&
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200504 SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0)))
505 ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000506 break;
507
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000508 case SRE_OP_ANY:
509 /* repeated dot wildcard. */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000510 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200511 while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0)))
512 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000513 break;
514
515 case SRE_OP_ANY_ALL:
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000516 /* repeated dot wildcard. skip to the end of the target
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000517 string, and backtrack from there */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000518 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000519 ptr = end;
520 break;
521
522 case SRE_OP_LITERAL:
523 /* repeated literal */
524 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000525 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200526 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr)
527 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000528 break;
529
530 case SRE_OP_LITERAL_IGNORE:
531 /* repeated literal */
532 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000533 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr)
535 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000536 break;
537
538 case SRE_OP_NOT_LITERAL:
539 /* repeated non-literal */
540 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
543 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000544 break;
Tim Peters3d563502006-01-21 02:47:53 +0000545
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000546 case SRE_OP_NOT_LITERAL_IGNORE:
547 /* repeated non-literal */
548 chr = pattern[1];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000549 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr)
551 ptr += state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000552 break;
553
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000554 default:
555 /* repeated single character pattern */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000556 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557 while ((char*) state->ptr < end) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000558 i = SRE_MATCH(state, pattern);
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000559 if (i < 0)
560 return i;
561 if (!i)
562 break;
563 }
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000564 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 ((char*)state->ptr - ptr)/state->charsize));
566 return ((char*)state->ptr - ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000567 }
568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, (ptr - (char*) state->ptr)/state->charsize));
570 return (ptr - (char*) state->ptr)/state->charsize;
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000571}
572
#if 0 /* not used in this release */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* Test an SRE_OP_INFO block against the current position
       (state->ptr).  Returns the number of SRE_CODE objects to skip
       when the block is compatible with the input, or 0 when the
       input cannot match. */

    char* limit = state->end;
    char* cursor = state->ptr;
    Py_ssize_t k;

    /* a non-zero pattern[3] is the minimum number of characters that
       must still be available */
    if (pattern[3] && (limit - cursor)/state->charsize < pattern[3])
        return 0;

    /* without a literal prefix longer than one character there is
       nothing more to verify */
    if (!((pattern[2] & SRE_INFO_PREFIX) && pattern[5] > 1))
        return pattern[0];

    /* <length> <skip> <prefix data> <overlap data> */
    k = 0;
    while (k < pattern[5]) {
        /* every prefix character has to equal the input character */
        if ((SRE_CODE) SRE_CHARGET(state, cursor, k) != pattern[7 + k])
            return 0;
        k++;
    }
    return pattern[0] + 2 * pattern[6];
}
#endif
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000600
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000601/* The macros below should be used to protect recursive SRE_MATCH()
602 * calls that *failed* and do *not* return immediately (IOW, those
603 * that will backtrack). Explaining:
604 *
605 * - Recursive SRE_MATCH() returned true: that's usually a success
606 * (besides atypical cases like ASSERT_NOT), therefore there's no
607 * reason to restore lastmark;
608 *
609 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
610 * is returning to the caller: If the current SRE_MATCH() is the
611 * top function of the recursion, returning false will be a matching
612 * failure, and it doesn't matter where lastmark is pointing to.
613 * If it's *not* the top function, it will be a recursive SRE_MATCH()
614 * failure by itself, and the calling SRE_MATCH() will have to deal
615 * with the failure by the same rules explained here (it will restore
616 * lastmark by itself if necessary);
617 *
618 * - Recursive SRE_MATCH() returned false, and will continue the
619 * outside 'for' loop: must be protected when breaking, since the next
620 * OP could potentially depend on lastmark;
Tim Peters3d563502006-01-21 02:47:53 +0000621 *
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +0000622 * - Recursive SRE_MATCH() returned false, and will be called again
623 * inside a local for/while loop: must be protected between each
624 * loop iteration, since the recursive SRE_MATCH() could do anything,
625 * and could potentially depend on lastmark.
626 *
627 * For more information, check the discussion at SF patch #712900.
628 */
/* Copy state->lastmark and state->lastindex into the current match
   context.  Both `state` and `ctx` are taken from the scope in which
   the macro is expanded. */
#define LASTMARK_SAVE()                        \
    do {                                       \
        ctx->lastindex = state->lastindex;     \
        ctx->lastmark = state->lastmark;       \
    } while (0)

/* Copy the values held in the current match context back into
   state->lastmark and state->lastindex. */
#define LASTMARK_RESTORE()                     \
    do {                                       \
        state->lastindex = ctx->lastindex;     \
        state->lastmark = ctx->lastmark;       \
    } while (0)
639
/* Exit helpers for the matcher.
 *
 * RETURN_ERROR(i) leaves the enclosing function at once with value i.
 * RETURN_FAILURE / RETURN_SUCCESS store 0 / 1 in the local `ret` and
 * jump to the local `exit` label; both must exist at the expansion site.
 *
 * The RETURN_ON_* forms inspect a result code: negative values are
 * returned as errors, and the SUCCESS / FAILURE variants additionally
 * leave through `exit` for a positive / zero code.
 *
 * The argument is parenthesized so that any expression can be passed.
 * It is still evaluated more than once, so pass something without side
 * effects. */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
650
/* Turn a macro argument into a string literal (used for type names in
   trace output).  The argument is not macro-expanded first. */
#define SFY(x) #x

/* Reserve sizeof(type) bytes on top of the data stack of `state` and
 * store the address of the reserved area in `ptr`.
 *
 * Relies on names from the expansion site: `alloc_pos` receives the
 * offset of the reserved area, and `ctx` / `ctx_pos` identify the
 * current match context.  When the stack is too small it is enlarged
 * with data_stack_grow(); a negative result is returned from the
 * enclosing function.  After a successful grow, `ctx` is looked up again
 * from `ctx_pos` (presumably because growing may relocate the stack).
 *
 * `state` and `ptr` are parenthesized so that arbitrary expressions
 * (e.g. `&local_state`) can be passed. */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = (state)->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, sizeof(type))); \
    if ((state)->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow((state), sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    (ptr) = (type*)((state)->data_stack+alloc_pos); \
    (state)->data_stack_base += sizeof(type); \
} while (0)
667
/* Point `ptr` at the object of the given `type` stored at byte offset
   `pos` in the data stack of `state`.  Nothing is copied or allocated.
   `state`, `ptr` and `pos` are parenthesized so that arbitrary
   expressions can be passed. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), (pos))); \
    (ptr) = (type*)((state)->data_stack+(pos)); \
} while (0)
673
/* Append `size` bytes read from `data` to the data stack of `state`.
 *
 * When the stack is too small it is enlarged with data_stack_grow(); a
 * negative result is returned from the enclosing function.  After a
 * successful grow, `ctx` is looked up again from `ctx_pos` (both taken
 * from the expansion site).
 *
 * `data` is evaluated again for the memcpy, after the possible grow, so
 * an expression written in terms of `ctx` sees the refreshed pointer.
 * Do not cache it in a temporary.
 *
 * All parameters are parenthesized so that arbitrary expressions can be
 * passed as `size` without changing the capacity test. */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           (data), (state)->data_stack_base, (size))); \
    if ((state)->data_stack_size < (state)->data_stack_base+(size)) { \
        int j = data_stack_grow((state), (size)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT((state), SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy((state)->data_stack+(state)->data_stack_base, (data), (size)); \
    (state)->data_stack_base += (size); \
} while (0)
687
/* Copy the topmost `size` bytes of the data stack of `state` into
   `data`.  The bytes are removed from the stack only when `discard` is
   true; otherwise the stack is left unchanged.  `size` is parenthesized
   so that the source offset (base - size) is computed correctly for any
   expression. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           (data), (state)->data_stack_base-(size), (size))); \
    memcpy((data), (state)->data_stack+(state)->data_stack_base-(size), (size)); \
    if (discard) \
        (state)->data_stack_base -= (size); \
} while (0)
696
/* Drop the topmost `size` bytes from the data stack of `state` without
   copying them anywhere.  `size` is parenthesized so that the traced
   offset (base - size) is correct for any expression. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           (state)->data_stack_base-(size), (size))); \
    (state)->data_stack_base -= (size); \
} while(0)
703
/* Shorthands for the DATA_STACK_* macros that always work on the
   `state` variable of the expansion site.  For the push/pop forms the
   byte count is the size of the object that `x` points to. */
#define DATA_PUSH(x)            DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x)             DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x)     DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p)         DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) DATA_STACK_LOOKUP_AT(state,t,p,pos)
714
/* Save / restore the first lastmark+1 entries of state->mark on the
 * data stack.  All four macros do nothing when `lastmark` is not
 * positive.
 *
 *   MARK_PUSH          copy the marks onto the stack
 *   MARK_POP           copy them back and remove them from the stack
 *   MARK_POP_KEEP      copy them back but leave them on the stack
 *   MARK_POP_DISCARD   remove them from the stack without copying
 *
 * MARK_PUSH overwrites the variable `i` of the expansion site: the
 * count is captured before the push because the push may refresh `ctx`,
 * and `lastmark` is typically an expression in terms of `ctx`.
 *
 * `lastmark` is parenthesized so that any expression can be passed. */
#define MARK_PUSH(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            i = (lastmark); /* ctx->lastmark may change if reallocated */ \
            DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
        } \
    } while (0)
#define MARK_POP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 1); \
        } \
    } while (0)
#define MARK_POP_KEEP(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP(state, state->mark, ((lastmark)+1)*sizeof(void*), 0); \
        } \
    } while (0)
#define MARK_POP_DISCARD(lastmark) \
    do { \
        if ((lastmark) > 0) { \
            DATA_STACK_POP_DISCARD(state, ((lastmark)+1)*sizeof(void*)); \
        } \
    } while (0)
732
/* Values stored in SRE_MATCH_CONTEXT.jump.  DO_JUMP() records one of
   these in the context it creates; JUMP_NONE marks the context created
   on entry to SRE_MATCH(). */
#define JUMP_NONE               0

/* MAX_UNTIL / MIN_UNTIL */
#define JUMP_MAX_UNTIL_1        1
#define JUMP_MAX_UNTIL_2        2
#define JUMP_MAX_UNTIL_3        3
#define JUMP_MIN_UNTIL_1        4
#define JUMP_MIN_UNTIL_2        5
#define JUMP_MIN_UNTIL_3        6

/* REPEAT, REPEAT_ONE, MIN_REPEAT_ONE */
#define JUMP_REPEAT             7
#define JUMP_REPEAT_ONE_1       8
#define JUMP_REPEAT_ONE_2       9
#define JUMP_MIN_REPEAT_ONE     10

/* BRANCH, ASSERT, ASSERT_NOT */
#define JUMP_BRANCH             11
#define JUMP_ASSERT             12
#define JUMP_ASSERT_NOT         13
747
/* Start matching `nextpattern` in a fresh match context and arrange for
 * execution to be resumable at `jumplabel`.
 *
 * A new SRE_MATCH_CONTEXT is allocated on the data stack; it records the
 * position of the current context (last_ctx_pos), the resume code
 * `jumpvalue` and the pattern to run.  The new context then becomes
 * `ctx` and control transfers to the `entrance` label.  `jumplabel` is
 * defined right after that goto, so code that jumps to it continues
 * with the statement following the DO_JUMP() invocation.
 *
 * This expands to several statements plus a label and is NOT wrapped in
 * do { } while (0): use it only as a full statement inside a braced
 * block, followed by a semicolon.
 *
 * The last line deliberately has no trailing backslash, so the macro
 * body ends here regardless of what follows in the file. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = (jumpvalue); \
    nextctx->pattern = (nextpattern); \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
758
759typedef struct {
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000760 Py_ssize_t last_ctx_pos;
761 Py_ssize_t jump;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200762 char* ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000763 SRE_CODE* pattern;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000764 Py_ssize_t count;
765 Py_ssize_t lastmark;
766 Py_ssize_t lastindex;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000767 union {
768 SRE_CODE chr;
769 SRE_REPEAT* rep;
770 } u;
771} SRE_MATCH_CONTEXT;
772
773/* check if string matches the given pattern. returns <0 for
774 error, 0 for failure, and 1 for success */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000775LOCAL(Py_ssize_t)
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000776SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
Guido van Rossumb700df92000-03-31 14:59:30 +0000777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200778 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000779 Py_ssize_t alloc_pos, ctx_pos = -1;
780 Py_ssize_t i, ret = 0;
781 Py_ssize_t jump;
Christian Heimes2380ac72008-01-09 00:17:24 +0000782 unsigned int sigcount=0;
Guido van Rossumb700df92000-03-31 14:59:30 +0000783
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000784 SRE_MATCH_CONTEXT* ctx;
785 SRE_MATCH_CONTEXT* nextctx;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000786
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +0000787 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
Fredrik Lundh96ab4652000-08-03 16:29:50 +0000788
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
790 ctx->last_ctx_pos = -1;
791 ctx->jump = JUMP_NONE;
792 ctx->pattern = pattern;
793 ctx_pos = alloc_pos;
794
795entrance:
796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200797 ctx->ptr = (char *)state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798
799 if (ctx->pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000800 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000801 /* <INFO> <1=skip> <2=flags> <3=min> ... */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000803 TRACE(("reject (got %d chars, need %d)\n",
Serhiy Storchakac1b59d42012-12-29 23:38:48 +0200804 (end - ctx->ptr)/state->charsize, ctx->pattern[3]));
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000805 RETURN_FAILURE;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000806 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000807 ctx->pattern += ctx->pattern[1] + 1;
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000808 }
809
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000810 for (;;) {
Christian Heimes2380ac72008-01-09 00:17:24 +0000811 ++sigcount;
812 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
813 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
Guido van Rossumb700df92000-03-31 14:59:30 +0000814
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000815 switch (*ctx->pattern++) {
Guido van Rossumb700df92000-03-31 14:59:30 +0000816
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000817 case SRE_OP_MARK:
818 /* set mark */
819 /* <MARK> <gid> */
820 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
821 ctx->ptr, ctx->pattern[0]));
822 i = ctx->pattern[0];
823 if (i & 1)
824 state->lastindex = i/2 + 1;
825 if (i > state->lastmark) {
826 /* state->lastmark is the highest valid index in the
827 state->mark array. If it is increased by more than 1,
828 the intervening marks must be set to NULL to signal
Tim Peters3d563502006-01-21 02:47:53 +0000829 that these marks have not been encountered. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000830 Py_ssize_t j = state->lastmark + 1;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000831 while (j < i)
832 state->mark[j++] = NULL;
833 state->lastmark = i;
834 }
835 state->mark[i] = ctx->ptr;
836 ctx->pattern++;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000837 break;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000838
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000839 case SRE_OP_LITERAL:
840 /* match literal string */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000841 /* <LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000842 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
843 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000845 RETURN_FAILURE;
846 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000848 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000849
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000850 case SRE_OP_NOT_LITERAL:
851 /* match anything that is not literal character */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000852 /* <NOT_LITERAL> <code> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000853 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
854 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200855 if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000856 RETURN_FAILURE;
857 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000859 break;
860
861 case SRE_OP_SUCCESS:
862 /* end of pattern */
863 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
864 state->ptr = ctx->ptr;
865 RETURN_SUCCESS;
866
867 case SRE_OP_AT:
868 /* match at given position */
869 /* <AT> <code> */
870 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
871 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
872 RETURN_FAILURE;
873 ctx->pattern++;
874 break;
875
876 case SRE_OP_CATEGORY:
877 /* match at given category */
878 /* <CATEGORY> <code> */
879 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000882 RETURN_FAILURE;
883 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000885 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000886
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000887 case SRE_OP_ANY:
Fredrik Lundhe1869832000-08-01 22:47:49 +0000888 /* match anything (except a newline) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000889 /* <ANY> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000890 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891 if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0)))
892 RETURN_FAILURE;
893 ctx->ptr += state->charsize;
Fredrik Lundhe1869832000-08-01 22:47:49 +0000894 break;
895
896 case SRE_OP_ANY_ALL:
897 /* match anything */
898 /* <ANY_ALL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000899 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
900 if (ctx->ptr >= end)
901 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200902 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000903 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000904
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000905 case SRE_OP_IN:
906 /* match set member (or non_member) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000907 /* <IN> <skip> <set> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000908 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0)))
910 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000911 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000913 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000914
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000915 case SRE_OP_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000916 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
917 ctx->pattern, ctx->ptr, ctx->pattern[0]));
918 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000920 RETURN_FAILURE;
921 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000923 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000924
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000925 case SRE_OP_NOT_LITERAL_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000926 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
927 ctx->pattern, ctx->ptr, *ctx->pattern));
928 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000930 RETURN_FAILURE;
931 ctx->pattern++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200932 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000933 break;
Guido van Rossumb700df92000-03-31 14:59:30 +0000934
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000935 case SRE_OP_IN_IGNORE:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000936 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
937 if (ctx->ptr >= end
938 || !SRE_CHARSET(ctx->pattern+1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200939 (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0))))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000940 RETURN_FAILURE;
941 ctx->pattern += ctx->pattern[0];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942 ctx->ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000943 break;
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000944
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000945 case SRE_OP_JUMP:
946 case SRE_OP_INFO:
947 /* jump forward */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000948 /* <JUMP> <offset> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000949 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
950 ctx->ptr, ctx->pattern[0]));
951 ctx->pattern += ctx->pattern[0];
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000952 break;
953
954 case SRE_OP_BRANCH:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000955 /* alternation */
956 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000957 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000958 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000959 ctx->u.rep = state->repeat;
960 if (ctx->u.rep)
961 MARK_PUSH(ctx->lastmark);
962 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
963 if (ctx->pattern[1] == SRE_OP_LITERAL &&
964 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2]))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000966 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000967 if (ctx->pattern[1] == SRE_OP_IN &&
968 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0))))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000970 continue;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000971 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000972 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000973 if (ret) {
974 if (ctx->u.rep)
975 MARK_POP_DISCARD(ctx->lastmark);
976 RETURN_ON_ERROR(ret);
977 RETURN_SUCCESS;
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000978 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000979 if (ctx->u.rep)
980 MARK_POP_KEEP(ctx->lastmark);
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +0000981 LASTMARK_RESTORE();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000982 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000983 if (ctx->u.rep)
984 MARK_POP_DISCARD(ctx->lastmark);
985 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +0000986
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000987 case SRE_OP_REPEAT_ONE:
988 /* match repeated sequence (maximizing regexp) */
989
990 /* this operator only works if the repeated item is
991 exactly one character wide, and we're not already
992 collecting backtracking points. for other cases,
Fredrik Lundh770617b2001-01-14 15:06:11 +0000993 use the MAX_REPEAT operator */
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000994
995 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
996
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000997 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
998 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000 if (ctx->ptr + state->charsize * ctx->pattern[1] > end)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001001 RETURN_FAILURE; /* cannot match */
Fredrik Lundhe1869832000-08-01 22:47:49 +00001002
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001003 state->ptr = ctx->ptr;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001004
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001005 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1006 RETURN_ON_ERROR(ret);
1007 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1008 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009 ctx->ptr += state->charsize * ctx->count;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001010
1011 /* when we arrive here, count contains the number of
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001012 matches, and ctx->ptr points to the tail of the target
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001013 string. check if the rest of the pattern matches,
1014 and backtrack if not. */
1015
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001016 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001017 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001018
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001019 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001020 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001021 state->ptr = ctx->ptr;
1022 RETURN_SUCCESS;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001023 }
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001024
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001025 LASTMARK_SAVE();
1026
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001027 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001028 /* tail starts with a literal. skip positions where
1029 the rest of the pattern cannot possibly match */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001030 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001031 for (;;) {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001032 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
Victor Stinner63ab8752011-11-22 03:31:20 +01001033 (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) {
1035 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001036 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001037 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001038 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001039 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001040 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001041 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1042 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001043 if (ret) {
1044 RETURN_ON_ERROR(ret);
1045 RETURN_SUCCESS;
1046 }
Tim Peters3d563502006-01-21 02:47:53 +00001047
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001048 LASTMARK_RESTORE();
Tim Peters3d563502006-01-21 02:47:53 +00001049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001051 ctx->count--;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001052 }
1053
1054 } else {
1055 /* general case */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001056 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001057 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001058 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1059 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001060 if (ret) {
1061 RETURN_ON_ERROR(ret);
1062 RETURN_SUCCESS;
1063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ctx->ptr -= state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 ctx->count--;
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001066 LASTMARK_RESTORE();
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001067 }
1068 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069 RETURN_FAILURE;
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +00001070
Guido van Rossum41c99e72003-04-14 17:59:34 +00001071 case SRE_OP_MIN_REPEAT_ONE:
1072 /* match repeated sequence (minimizing regexp) */
1073
1074 /* this operator only works if the repeated item is
1075 exactly one character wide, and we're not already
1076 collecting backtracking points. for other cases,
1077 use the MIN_REPEAT operator */
1078
1079 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1080
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001081 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1082 ctx->pattern[1], ctx->pattern[2]));
Guido van Rossum41c99e72003-04-14 17:59:34 +00001083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 if (ctx->ptr + state->charsize * ctx->pattern[1] > end)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001085 RETURN_FAILURE; /* cannot match */
Guido van Rossum41c99e72003-04-14 17:59:34 +00001086
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001087 state->ptr = ctx->ptr;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001088
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001089 if (ctx->pattern[1] == 0)
1090 ctx->count = 0;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001091 else {
1092 /* count using pattern min as the maximum */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001093 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1094 RETURN_ON_ERROR(ret);
1095 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001096 if (ret < (Py_ssize_t) ctx->pattern[1])
Tim Peters3d563502006-01-21 02:47:53 +00001097 /* didn't match minimum number of times */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001098 RETURN_FAILURE;
1099 /* advance past minimum matches of repeat */
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001100 ctx->count = ret;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 ctx->ptr += state->charsize * ctx->count;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001102 }
1103
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001104 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
Guido van Rossum41c99e72003-04-14 17:59:34 +00001105 /* tail is empty. we're finished */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001106 state->ptr = ctx->ptr;
1107 RETURN_SUCCESS;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001108
1109 } else {
1110 /* general case */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00001111 LASTMARK_SAVE();
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001112 while ((Py_ssize_t)ctx->pattern[2] == 65535
1113 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001114 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001115 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1116 ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001117 if (ret) {
1118 RETURN_ON_ERROR(ret);
1119 RETURN_SUCCESS;
1120 }
1121 state->ptr = ctx->ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001122 ret = SRE_COUNT(state, ctx->pattern+3, 1);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001123 RETURN_ON_ERROR(ret);
Gustavo Niemeyer166878f2004-12-02 16:15:39 +00001124 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001125 if (ret == 0)
Guido van Rossum41c99e72003-04-14 17:59:34 +00001126 break;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001127 assert(ret == 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001129 ctx->count++;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001130 LASTMARK_RESTORE();
Guido van Rossum41c99e72003-04-14 17:59:34 +00001131 }
Guido van Rossum41c99e72003-04-14 17:59:34 +00001132 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001133 RETURN_FAILURE;
Guido van Rossum41c99e72003-04-14 17:59:34 +00001134
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001135 case SRE_OP_REPEAT:
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001136 /* create repeat context. all the hard work is done
Fredrik Lundh770617b2001-01-14 15:06:11 +00001137 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001138 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001139 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1140 ctx->pattern[1], ctx->pattern[2]));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001141
1142 /* install new repeat context */
Thomas Wouters477c8d52006-05-27 19:21:47 +00001143 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001144 if (!ctx->u.rep) {
1145 PyErr_NoMemory();
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00001146 RETURN_FAILURE;
Thomas Wouters89f507f2006-12-13 04:49:30 +00001147 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001148 ctx->u.rep->count = -1;
1149 ctx->u.rep->pattern = ctx->pattern;
1150 ctx->u.rep->prev = state->repeat;
1151 ctx->u.rep->last_ptr = NULL;
1152 state->repeat = ctx->u.rep;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001153
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001154 state->ptr = ctx->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001155 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001156 state->repeat = ctx->u.rep->prev;
Thomas Wouters477c8d52006-05-27 19:21:47 +00001157 PyObject_FREE(ctx->u.rep);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001158
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001159 if (ret) {
1160 RETURN_ON_ERROR(ret);
1161 RETURN_SUCCESS;
1162 }
1163 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001164
1165 case SRE_OP_MAX_UNTIL:
1166 /* maximizing repeat */
1167 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1168
1169 /* FIXME: we probably need to deal with zero-width
1170 matches in here... */
1171
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001172 ctx->u.rep = state->repeat;
1173 if (!ctx->u.rep)
1174 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001175
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001176 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001177
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001178 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001179
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001180 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1181 ctx->ptr, ctx->count));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001182
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001183 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001184 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001185 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001186 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1187 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001188 if (ret) {
1189 RETURN_ON_ERROR(ret);
1190 RETURN_SUCCESS;
1191 }
1192 ctx->u.rep->count = ctx->count-1;
1193 state->ptr = ctx->ptr;
1194 RETURN_FAILURE;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001195 }
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001196
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 if ((ctx->count < ctx->u.rep->pattern[2] ||
1198 ctx->u.rep->pattern[2] == 65535) &&
1199 state->ptr != ctx->u.rep->last_ptr) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001200 /* we may have enough matches, but if we can
1201 match another item, do so */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001202 ctx->u.rep->count = ctx->count;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001203 LASTMARK_SAVE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001204 MARK_PUSH(ctx->lastmark);
1205 /* zero-width match protection */
1206 DATA_PUSH(&ctx->u.rep->last_ptr);
1207 ctx->u.rep->last_ptr = state->ptr;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001208 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1209 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001210 DATA_POP(&ctx->u.rep->last_ptr);
1211 if (ret) {
1212 MARK_POP_DISCARD(ctx->lastmark);
1213 RETURN_ON_ERROR(ret);
1214 RETURN_SUCCESS;
1215 }
1216 MARK_POP(ctx->lastmark);
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001217 LASTMARK_RESTORE();
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001220 }
1221
1222 /* cannot match more repeated items here. make sure the
1223 tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001224 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001225 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001226 RETURN_ON_SUCCESS(ret);
1227 state->repeat = ctx->u.rep;
1228 state->ptr = ctx->ptr;
1229 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001230
1231 case SRE_OP_MIN_UNTIL:
1232 /* minimizing repeat */
1233 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1234
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001235 ctx->u.rep = state->repeat;
1236 if (!ctx->u.rep)
1237 RETURN_ERROR(SRE_ERROR_STATE);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001238
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001239 state->ptr = ctx->ptr;
Gustavo Niemeyer3c9068b2003-04-22 15:39:09 +00001240
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001241 ctx->count = ctx->u.rep->count+1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001242
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001243 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1244 ctx->ptr, ctx->count, ctx->u.rep->pattern));
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001245
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001246 if (ctx->count < ctx->u.rep->pattern[1]) {
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001247 /* not enough matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001248 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001249 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1250 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001251 if (ret) {
1252 RETURN_ON_ERROR(ret);
1253 RETURN_SUCCESS;
1254 }
1255 ctx->u.rep->count = ctx->count-1;
1256 state->ptr = ctx->ptr;
1257 RETURN_FAILURE;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001258 }
1259
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001260 LASTMARK_SAVE();
1261
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001262 /* see if the tail matches */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001263 state->repeat = ctx->u.rep->prev;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001264 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001265 if (ret) {
1266 RETURN_ON_ERROR(ret);
1267 RETURN_SUCCESS;
1268 }
Fredrik Lundhfa25a7d2001-01-14 23:55:55 +00001269
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001270 state->repeat = ctx->u.rep;
1271 state->ptr = ctx->ptr;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001272
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001273 LASTMARK_RESTORE();
1274
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001275 if (ctx->count >= ctx->u.rep->pattern[2]
1276 && ctx->u.rep->pattern[2] != 65535)
1277 RETURN_FAILURE;
Gustavo Niemeyercaf1c9d2003-04-27 14:42:54 +00001278
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001279 ctx->u.rep->count = ctx->count;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001280 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1281 ctx->u.rep->pattern+3);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001282 if (ret) {
1283 RETURN_ON_ERROR(ret);
1284 RETURN_SUCCESS;
1285 }
1286 ctx->u.rep->count = ctx->count-1;
1287 state->ptr = ctx->ptr;
1288 RETURN_FAILURE;
1289
1290 case SRE_OP_GROUPREF:
1291 /* match backreference */
1292 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1293 ctx->ptr, ctx->pattern[0]));
1294 i = ctx->pattern[0];
1295 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001296 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001297 if (groupref >= state->lastmark) {
1298 RETURN_FAILURE;
1299 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001300 char* p = (char*) state->mark[groupref];
1301 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001302 if (!p || !e || e < p)
1303 RETURN_FAILURE;
1304 while (p < e) {
Victor Stinner63ab8752011-11-22 03:31:20 +01001305 if (ctx->ptr >= end ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001307 RETURN_FAILURE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 p += state->charsize;
1309 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001310 }
1311 }
1312 }
1313 ctx->pattern++;
1314 break;
1315
1316 case SRE_OP_GROUPREF_IGNORE:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1319 ctx->ptr, ctx->pattern[0]));
1320 i = ctx->pattern[0];
1321 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001322 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001323 if (groupref >= state->lastmark) {
1324 RETURN_FAILURE;
1325 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 char* p = (char*) state->mark[groupref];
1327 char* e = (char*) state->mark[groupref+1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001328 if (!p || !e || e < p)
1329 RETURN_FAILURE;
1330 while (p < e) {
1331 if (ctx->ptr >= end ||
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001332 state->lower(SRE_CHARGET(state, ctx->ptr, 0)) !=
1333 state->lower(SRE_CHARGET(state, p, 0)))
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001334 RETURN_FAILURE;
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001335 p += state->charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 ctx->ptr += state->charsize;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001337 }
1338 }
1339 }
1340 ctx->pattern++;
1341 break;
1342
1343 case SRE_OP_GROUPREF_EXISTS:
1344 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1345 ctx->ptr, ctx->pattern[0]));
1346 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1347 i = ctx->pattern[0];
1348 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001349 Py_ssize_t groupref = i+i;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001350 if (groupref >= state->lastmark) {
1351 ctx->pattern += ctx->pattern[1];
1352 break;
1353 } else {
1354 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1355 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1356 if (!p || !e || e < p) {
1357 ctx->pattern += ctx->pattern[1];
1358 break;
1359 }
1360 }
1361 }
1362 ctx->pattern += 2;
1363 break;
1364
1365 case SRE_OP_ASSERT:
1366 /* assert subpattern */
1367 /* <ASSERT> <skip> <back> <pattern> */
1368 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1369 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001371 if (state->ptr < state->beginning)
1372 RETURN_FAILURE;
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001373 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001374 RETURN_ON_FAILURE(ret);
1375 ctx->pattern += ctx->pattern[0];
1376 break;
1377
1378 case SRE_OP_ASSERT_NOT:
1379 /* assert not subpattern */
1380 /* <ASSERT_NOT> <skip> <back> <pattern> */
1381 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1382 ctx->ptr, ctx->pattern[1]));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 state->ptr = ctx->ptr - state->charsize * ctx->pattern[1];
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001384 if (state->ptr >= state->beginning) {
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001385 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001386 if (ret) {
1387 RETURN_ON_ERROR(ret);
1388 RETURN_FAILURE;
1389 }
1390 }
1391 ctx->pattern += ctx->pattern[0];
1392 break;
1393
1394 case SRE_OP_FAILURE:
1395 /* immediate failure */
1396 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1397 RETURN_FAILURE;
Guido van Rossumb700df92000-03-31 14:59:30 +00001398
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001399 default:
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001400 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1401 ctx->pattern[-1]));
1402 RETURN_ERROR(SRE_ERROR_ILLEGAL);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001403 }
1404 }
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001405
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001406exit:
1407 ctx_pos = ctx->last_ctx_pos;
1408 jump = ctx->jump;
1409 DATA_POP_DISCARD(ctx);
1410 if (ctx_pos == -1)
1411 return ret;
1412 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1413
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001414 switch (jump) {
1415 case JUMP_MAX_UNTIL_2:
1416 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1417 goto jump_max_until_2;
1418 case JUMP_MAX_UNTIL_3:
1419 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1420 goto jump_max_until_3;
1421 case JUMP_MIN_UNTIL_2:
1422 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1423 goto jump_min_until_2;
1424 case JUMP_MIN_UNTIL_3:
1425 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1426 goto jump_min_until_3;
1427 case JUMP_BRANCH:
1428 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1429 goto jump_branch;
1430 case JUMP_MAX_UNTIL_1:
1431 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1432 goto jump_max_until_1;
1433 case JUMP_MIN_UNTIL_1:
1434 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1435 goto jump_min_until_1;
1436 case JUMP_REPEAT:
1437 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1438 goto jump_repeat;
1439 case JUMP_REPEAT_ONE_1:
1440 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1441 goto jump_repeat_one_1;
1442 case JUMP_REPEAT_ONE_2:
1443 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1444 goto jump_repeat_one_2;
1445 case JUMP_MIN_REPEAT_ONE:
1446 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1447 goto jump_min_repeat_one;
1448 case JUMP_ASSERT:
1449 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1450 goto jump_assert;
1451 case JUMP_ASSERT_NOT:
1452 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1453 goto jump_assert_not;
1454 case JUMP_NONE:
1455 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1456 break;
1457 }
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001458
1459 return ret; /* should never get here */
Guido van Rossumb700df92000-03-31 14:59:30 +00001460}
1461
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001462LOCAL(Py_ssize_t)
Guido van Rossumb700df92000-03-31 14:59:30 +00001463SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 char* ptr = (char*)state->start;
1466 char* end = (char*)state->end;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001467 Py_ssize_t status = 0;
1468 Py_ssize_t prefix_len = 0;
1469 Py_ssize_t prefix_skip = 0;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001470 SRE_CODE* prefix = NULL;
1471 SRE_CODE* charset = NULL;
1472 SRE_CODE* overlap = NULL;
1473 int flags = 0;
Guido van Rossumb700df92000-03-31 14:59:30 +00001474
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001475 if (pattern[0] == SRE_OP_INFO) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001476 /* optimization info block */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001477 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001478
1479 flags = pattern[2];
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001480
Gustavo Niemeyer28b5bb32003-06-26 14:41:08 +00001481 if (pattern[3] > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001482 /* adjust end point (but make sure we leave at least one
Fredrik Lundh3562f112000-07-02 12:00:07 +00001483 character in there, so literal search will work) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 end -= (pattern[3]-1) * state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001485 if (end <= ptr)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 end = ptr + state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001487 }
1488
Fredrik Lundh3562f112000-07-02 12:00:07 +00001489 if (flags & SRE_INFO_PREFIX) {
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001490 /* pattern starts with a known prefix */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001491 /* <length> <skip> <prefix data> <overlap data> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001492 prefix_len = pattern[5];
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001493 prefix_skip = pattern[6];
1494 prefix = pattern + 7;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001495 overlap = prefix + prefix_len - 1;
1496 } else if (flags & SRE_INFO_CHARSET)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +00001497 /* pattern starts with a character from a known set */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001498 /* <charset> */
Fredrik Lundh3562f112000-07-02 12:00:07 +00001499 charset = pattern + 5;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001500
1501 pattern += 1 + pattern[1];
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001502 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001503
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001504 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1505 TRACE(("charset = %p\n", charset));
1506
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001507#if defined(USE_FAST_SEARCH)
Fredrik Lundh28552902000-07-05 21:14:16 +00001508 if (prefix_len > 1) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001509 /* pattern starts with a known prefix. use the overlap
1510 table to skip forward as fast as we possibly can */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001511 Py_ssize_t i = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 end = (char *)state->end;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001513 while (ptr < end) {
1514 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) {
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001516 if (!i)
1517 break;
1518 else
1519 i = overlap[i];
1520 } else {
1521 if (++i == prefix_len) {
1522 /* found a potential match */
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001523 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524 state->start = ptr - (prefix_len - 1) * state->charsize;
1525 state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001526 if (flags & SRE_INFO_LITERAL)
1527 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001528 status = SRE_MATCH(state, pattern + 2*prefix_skip);
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001529 if (status != 0)
1530 return status;
1531 /* close but no cigar -- try again */
1532 i = overlap[i];
1533 }
1534 break;
1535 }
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 ptr += state->charsize;
Fredrik Lundh29c08be2000-06-29 23:33:12 +00001538 }
1539 return 0;
1540 }
1541#endif
Fredrik Lundh80946112000-06-29 18:03:25 +00001542
Fredrik Lundh3562f112000-07-02 12:00:07 +00001543 if (pattern[0] == SRE_OP_LITERAL) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001544 /* pattern starts with a literal character. this is used
Fredrik Lundh3562f112000-07-02 12:00:07 +00001545 for short prefixes, and if fast search is disabled */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001546 SRE_CODE chr = pattern[1];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001548 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr)
1550 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001551 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001552 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001553 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001554 state->start = ptr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 ptr += state->charsize;
1556 state->ptr = ptr;
Fredrik Lundhdac58492001-10-21 21:48:30 +00001557 if (flags & SRE_INFO_LITERAL)
1558 return 1; /* we got all of it */
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001559 status = SRE_MATCH(state, pattern + 2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001560 if (status != 0)
1561 break;
Fredrik Lundh3562f112000-07-02 12:00:07 +00001562 }
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001563 } else if (charset) {
1564 /* pattern starts with a character from a known set */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 end = (char*)state->end;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001566 for (;;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0)))
1568 ptr += state->charsize;
Gustavo Niemeyerc523b042002-11-07 03:28:56 +00001569 if (ptr >= end)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001570 return 0;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001571 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001572 state->start = ptr;
1573 state->ptr = ptr;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001574 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001575 if (status != 0)
1576 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 ptr += state->charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001578 }
1579 } else
1580 /* general case */
1581 while (ptr <= end) {
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001582 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 state->start = state->ptr = ptr;
1584 ptr += state->charsize;
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001585 status = SRE_MATCH(state, pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001586 if (status != 0)
1587 break;
1588 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001589
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001590 return status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001591}
Tim Peters3d563502006-01-21 02:47:53 +00001592
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001593#if !defined(SRE_RECURSIVE)
Guido van Rossumb700df92000-03-31 14:59:30 +00001594
1595/* -------------------------------------------------------------------- */
1596/* factories and destructors */
1597
1598/* see sre.h for object declarations */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001599static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001600static PyObject*pattern_scanner(PatternObject*, PyObject*, PyObject* kw);
Guido van Rossumb700df92000-03-31 14:59:30 +00001601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602static int
1603sre_literal_template(int charsize, char* ptr, Py_ssize_t len)
1604{
1605 /* check if given string is a literal template (i.e. no escapes) */
1606 struct {
1607 int charsize;
1608 } state = {
1609 charsize
1610 };
1611 while (len-- > 0) {
1612 if (SRE_CHARGET((&state), ptr, 0) == '\\')
1613 return 0;
1614 ptr += charsize;
1615 }
1616 return 1;
1617}
1618
Guido van Rossumb700df92000-03-31 14:59:30 +00001619static PyObject *
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00001620sre_codesize(PyObject* self, PyObject *unused)
Guido van Rossumb700df92000-03-31 14:59:30 +00001621{
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001622 return PyLong_FromSize_t(sizeof(SRE_CODE));
Guido van Rossumb700df92000-03-31 14:59:30 +00001623}
1624
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001625static PyObject *
Fredrik Lundhb389df32000-06-29 12:48:37 +00001626sre_getlower(PyObject* self, PyObject* args)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001627{
1628 int character, flags;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001629 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001630 return NULL;
1631 if (flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001632 return Py_BuildValue("i", sre_lower_locale(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001633 if (flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001634 return Py_BuildValue("i", sre_lower_unicode(character));
Fredrik Lundhb389df32000-06-29 12:48:37 +00001635 return Py_BuildValue("i", sre_lower(character));
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001636}
1637
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001638LOCAL(void)
1639state_reset(SRE_STATE* state)
1640{
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001641 /* FIXME: dynamic! */
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001642 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001643
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001644 state->lastmark = -1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001645 state->lastindex = -1;
1646
1647 state->repeat = NULL;
1648
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001649 data_stack_dealloc(state);
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001650}
1651
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001652static void*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653getstring(PyObject* string, Py_ssize_t* p_length,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001654 int* p_logical_charsize, int* p_charsize,
1655 Py_buffer *view)
Guido van Rossumb700df92000-03-31 14:59:30 +00001656{
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001657 /* given a python object, return a data pointer, a length (in
1658 characters), and a character size. return NULL if the object
1659 is not a string (or not compatible) */
Tim Peters3d563502006-01-21 02:47:53 +00001660
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001661 PyBufferProcs *buffer;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001662 Py_ssize_t size, bytes;
1663 int charsize;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001664 void* ptr;
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00001665
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001666 /* Unicode objects do not support the buffer API. So, get the data
1667 directly instead. */
1668 if (PyUnicode_Check(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 if (PyUnicode_READY(string) == -1)
1670 return NULL;
1671 ptr = PyUnicode_DATA(string);
1672 *p_length = PyUnicode_GET_LENGTH(string);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001673 *p_charsize = PyUnicode_KIND(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 *p_logical_charsize = 4;
Alexandre Vassalotti70a23712007-10-14 02:05:51 +00001675 return ptr;
1676 }
1677
Victor Stinner0058b862011-09-29 03:27:47 +02001678 /* get pointer to byte string buffer */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001679 view->len = -1;
Christian Heimes90aa7642007-12-19 02:45:37 +00001680 buffer = Py_TYPE(string)->tp_as_buffer;
Antoine Pitroufd036452008-08-19 17:56:33 +00001681 if (!buffer || !buffer->bf_getbuffer ||
Benjamin Petersone48944b2012-03-07 14:50:25 -06001682 (*buffer->bf_getbuffer)(string, view, PyBUF_SIMPLE) < 0) {
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001683 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1684 return NULL;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001685 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001686
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001687 /* determine buffer size */
Benjamin Petersone48944b2012-03-07 14:50:25 -06001688 bytes = view->len;
1689 ptr = view->buf;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001690
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001691 if (bytes < 0) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001692 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001693 goto err;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001694 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001695
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001696 /* determine character size */
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001697 size = PyObject_Size(string);
Guido van Rossumb700df92000-03-31 14:59:30 +00001698
Christian Heimes72b710a2008-05-26 13:28:38 +00001699 if (PyBytes_Check(string) || bytes == size)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001700 charsize = 1;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001701 else {
1702 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001703 goto err;
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00001704 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001705
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001706 *p_length = size;
1707 *p_charsize = charsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708 *p_logical_charsize = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001709
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001710 if (ptr == NULL) {
Antoine Pitroufd036452008-08-19 17:56:33 +00001711 PyErr_SetString(PyExc_ValueError,
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001712 "Buffer is NULL");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001713 goto err;
Travis E. Oliphantb99f7622007-08-18 11:21:56 +00001714 }
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001715 return ptr;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001716 err:
1717 PyBuffer_Release(view);
1718 view->buf = NULL;
1719 return NULL;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001720}
1721
1722LOCAL(PyObject*)
1723state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001724 Py_ssize_t start, Py_ssize_t end)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001725{
1726 /* prepare state object */
1727
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001728 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 int logical_charsize, charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001730 void* ptr;
1731
1732 memset(state, 0, sizeof(SRE_STATE));
1733
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001734 state->lastmark = -1;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001735 state->lastindex = -1;
1736
Benjamin Petersone48944b2012-03-07 14:50:25 -06001737 state->buffer.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001738 ptr = getstring(string, &length, &logical_charsize, &charsize, &state->buffer);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001739 if (!ptr)
Benjamin Petersone48944b2012-03-07 14:50:25 -06001740 goto err;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001741
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001742 if (logical_charsize == 1 && pattern->logical_charsize > 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001743 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001744 "can't use a string pattern on a bytes-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001745 goto err;
1746 }
Benjamin Peterson33d21a22012-03-07 14:59:13 -06001747 if (logical_charsize > 1 && pattern->logical_charsize == 1) {
Benjamin Petersone48944b2012-03-07 14:50:25 -06001748 PyErr_SetString(PyExc_TypeError,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03001749 "can't use a bytes pattern on a string-like object");
Benjamin Petersone48944b2012-03-07 14:50:25 -06001750 goto err;
1751 }
Antoine Pitroufd036452008-08-19 17:56:33 +00001752
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001753 /* adjust boundaries */
1754 if (start < 0)
1755 start = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001756 else if (start > length)
1757 start = length;
Guido van Rossumb700df92000-03-31 14:59:30 +00001758
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001759 if (end < 0)
1760 end = 0;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001761 else if (end > length)
1762 end = length;
1763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 state->logical_charsize = logical_charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00001765 state->charsize = charsize;
Guido van Rossumb700df92000-03-31 14:59:30 +00001766
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001767 state->beginning = ptr;
Guido van Rossumb700df92000-03-31 14:59:30 +00001768
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001769 state->start = (void*) ((char*) ptr + start * state->charsize);
1770 state->end = (void*) ((char*) ptr + end * state->charsize);
1771
1772 Py_INCREF(string);
1773 state->string = string;
1774 state->pos = start;
1775 state->endpos = end;
Guido van Rossumb700df92000-03-31 14:59:30 +00001776
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001777 if (pattern->flags & SRE_FLAG_LOCALE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001778 state->lower = sre_lower_locale;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001779 else if (pattern->flags & SRE_FLAG_UNICODE)
Fredrik Lundhb389df32000-06-29 12:48:37 +00001780 state->lower = sre_lower_unicode;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001781 else
Fredrik Lundhb389df32000-06-29 12:48:37 +00001782 state->lower = sre_lower;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00001783
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001784 return string;
Benjamin Petersone48944b2012-03-07 14:50:25 -06001785 err:
1786 if (state->buffer.buf)
1787 PyBuffer_Release(&state->buffer);
1788 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001789}
1790
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001791LOCAL(void)
1792state_fini(SRE_STATE* state)
1793{
Benjamin Petersone48944b2012-03-07 14:50:25 -06001794 if (state->buffer.buf)
1795 PyBuffer_Release(&state->buffer);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001796 Py_XDECREF(state->string);
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001797 data_stack_dealloc(state);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001798}
1799
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001800/* calculate offset from start of string */
1801#define STATE_OFFSET(state, member)\
1802 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1803
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001804LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001805state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001806{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001807 Py_ssize_t i, j;
Fredrik Lundh58100642000-08-09 09:14:35 +00001808
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001809 index = (index - 1) * 2;
1810
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001811 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00001812 if (empty)
1813 /* want empty string */
1814 i = j = 0;
1815 else {
1816 Py_INCREF(Py_None);
1817 return Py_None;
1818 }
Fredrik Lundh58100642000-08-09 09:14:35 +00001819 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001820 i = STATE_OFFSET(state, state->mark[index]);
1821 j = STATE_OFFSET(state, state->mark[index+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001822 }
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001823
Fredrik Lundh58100642000-08-09 09:14:35 +00001824 return PySequence_GetSlice(string, i, j);
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001825}
1826
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001827static void
1828pattern_error(int status)
1829{
1830 switch (status) {
1831 case SRE_ERROR_RECURSION_LIMIT:
1832 PyErr_SetString(
1833 PyExc_RuntimeError,
1834 "maximum recursion limit exceeded"
1835 );
1836 break;
1837 case SRE_ERROR_MEMORY:
1838 PyErr_NoMemory();
1839 break;
Christian Heimes2380ac72008-01-09 00:17:24 +00001840 case SRE_ERROR_INTERRUPTED:
1841 /* An exception has already been raised, so let it fly */
1842 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00001843 default:
1844 /* other error codes indicate compiler/engine bugs */
1845 PyErr_SetString(
1846 PyExc_RuntimeError,
1847 "internal error in regular expression engine"
1848 );
1849 }
1850}
1851
Guido van Rossumb700df92000-03-31 14:59:30 +00001852static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00001853pattern_dealloc(PatternObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00001854{
Raymond Hettinger027bb632004-05-31 03:09:25 +00001855 if (self->weakreflist != NULL)
1856 PyObject_ClearWeakRefs((PyObject *) self);
Benjamin Petersone48944b2012-03-07 14:50:25 -06001857 if (self->view.buf)
1858 PyBuffer_Release(&self->view);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001859 Py_XDECREF(self->pattern);
1860 Py_XDECREF(self->groupindex);
Fredrik Lundh6f5cba62001-01-16 07:05:29 +00001861 Py_XDECREF(self->indexgroup);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001862 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00001863}
1864
1865static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001866pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001867{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001868 SRE_STATE state;
1869 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001870
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001871 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001872 Py_ssize_t start = 0;
1873 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001874 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001875 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001876 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001877 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001878
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001879 string = state_init(&state, self, string, start, end);
1880 if (!string)
1881 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001882
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001883 state.ptr = state.start;
1884
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001885 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 if (state.logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001888 status = sre_match(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001889 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00001890 status = sre_umatch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001891 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001892
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001893 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
Thomas Wouters89f507f2006-12-13 04:49:30 +00001894 if (PyErr_Occurred())
1895 return NULL;
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001896
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001897 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001898
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001899 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001900}
1901
1902static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00001903pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00001904{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001905 SRE_STATE state;
1906 int status;
Guido van Rossumb700df92000-03-31 14:59:30 +00001907
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001908 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001909 Py_ssize_t start = 0;
1910 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00001911 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001912 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00001913 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001914 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00001915
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001916 string = state_init(&state, self, string, start, end);
1917 if (!string)
1918 return NULL;
1919
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001920 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001923 status = sre_search(&state, PatternObject_GetCode(self));
1924 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001925 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001926 }
Guido van Rossumb700df92000-03-31 14:59:30 +00001927
Fredrik Lundh7898c3e2000-08-07 20:59:04 +00001928 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1929
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001930 state_fini(&state);
Guido van Rossumb700df92000-03-31 14:59:30 +00001931
Thomas Wouters89f507f2006-12-13 04:49:30 +00001932 if (PyErr_Occurred())
1933 return NULL;
1934
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00001935 return pattern_new_match(self, &state, status);
Guido van Rossumb700df92000-03-31 14:59:30 +00001936}
1937
1938static PyObject*
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001939call(char* module, char* function, PyObject* args)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001940{
1941 PyObject* name;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001942 PyObject* mod;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001943 PyObject* func;
1944 PyObject* result;
1945
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001946 if (!args)
1947 return NULL;
Neal Norwitzfe537132007-08-26 03:55:15 +00001948 name = PyUnicode_FromString(module);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001949 if (!name)
1950 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001951 mod = PyImport_Import(name);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001952 Py_DECREF(name);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001953 if (!mod)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001954 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00001955 func = PyObject_GetAttrString(mod, function);
1956 Py_DECREF(mod);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001957 if (!func)
1958 return NULL;
1959 result = PyObject_CallObject(func, args);
1960 Py_DECREF(func);
1961 Py_DECREF(args);
1962 return result;
1963}
1964
#ifdef USE_BUILTIN_COPY
/* replace *object with the result of copy.deepcopy(*object, memo).
   returns 1 on success; on failure returns 0 and leaves *object as is */
static int
deepcopy(PyObject** object, PyObject* memo)
{
    PyObject* duplicate = call(
        "copy", "deepcopy",
        PyTuple_Pack(2, *object, memo)
        );

    if (duplicate == NULL) {
        return 0;
    }

    /* drop the old value and store the copy in its place */
    Py_DECREF(*object);
    *object = duplicate;

    return 1; /* success */
}
#endif
1984
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00001985static PyObject*
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001986join_list(PyObject* list, PyObject* string)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001987{
1988 /* join list elements */
1989
1990 PyObject* joiner;
1991#if PY_VERSION_HEX >= 0x01060000
1992 PyObject* function;
1993 PyObject* args;
1994#endif
1995 PyObject* result;
1996
Thomas Wouters1b7f8912007-09-19 03:06:30 +00001997 joiner = PySequence_GetSlice(string, 0, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00001998 if (!joiner)
1999 return NULL;
2000
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002001 if (PyList_GET_SIZE(list) == 0) {
2002 Py_DECREF(list);
2003 return joiner;
2004 }
2005
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002006#if PY_VERSION_HEX >= 0x01060000
2007 function = PyObject_GetAttrString(joiner, "join");
2008 if (!function) {
2009 Py_DECREF(joiner);
2010 return NULL;
2011 }
2012 args = PyTuple_New(1);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002013 if (!args) {
2014 Py_DECREF(function);
2015 Py_DECREF(joiner);
2016 return NULL;
2017 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002018 PyTuple_SET_ITEM(args, 0, list);
2019 result = PyObject_CallObject(function, args);
2020 Py_DECREF(args); /* also removes list */
2021 Py_DECREF(function);
2022#else
2023 result = call(
2024 "string", "join",
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002025 PyTuple_Pack(2, list, joiner)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002026 );
2027#endif
2028 Py_DECREF(joiner);
2029
2030 return result;
2031}
2032
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00002033static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00002034pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00002035{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002036 SRE_STATE state;
2037 PyObject* list;
2038 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002039 Py_ssize_t i, b, e;
Guido van Rossumb700df92000-03-31 14:59:30 +00002040
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002041 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002042 Py_ssize_t start = 0;
2043 Py_ssize_t end = PY_SSIZE_T_MAX;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002044 static char* kwlist[] = { "source", "pos", "endpos", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002045 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
Fredrik Lundh562586e2000-10-03 20:43:34 +00002046 &string, &start, &end))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002047 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002048
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002049 string = state_init(&state, self, string, start, end);
2050 if (!string)
2051 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00002052
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002053 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002054 if (!list) {
2055 state_fini(&state);
2056 return NULL;
2057 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002058
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002059 while (state.start <= state.end) {
Guido van Rossumb700df92000-03-31 14:59:30 +00002060
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002061 PyObject* item;
Tim Peters3d563502006-01-21 02:47:53 +00002062
Fredrik Lundhebc37b22000-10-28 19:30:41 +00002063 state_reset(&state);
2064
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002065 state.ptr = state.start;
2066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 if (state.logical_charsize == 1) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002068 status = sre_search(&state, PatternObject_GetCode(self));
2069 } else {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002070 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002071 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002072
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002073 if (PyErr_Occurred())
2074 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002075
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002076 if (status <= 0) {
Fredrik Lundh436c3d582000-06-29 08:58:44 +00002077 if (status == 0)
2078 break;
Fredrik Lundh96ab4652000-08-03 16:29:50 +00002079 pattern_error(status);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002080 goto error;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002081 }
Tim Peters3d563502006-01-21 02:47:53 +00002082
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002083 /* don't bother to build a match object */
2084 switch (self->groups) {
2085 case 0:
2086 b = STATE_OFFSET(&state, state.start);
2087 e = STATE_OFFSET(&state, state.ptr);
2088 item = PySequence_GetSlice(string, b, e);
2089 if (!item)
2090 goto error;
2091 break;
2092 case 1:
2093 item = state_getslice(&state, 1, string, 1);
2094 if (!item)
2095 goto error;
2096 break;
2097 default:
2098 item = PyTuple_New(self->groups);
2099 if (!item)
2100 goto error;
2101 for (i = 0; i < self->groups; i++) {
2102 PyObject* o = state_getslice(&state, i+1, string, 1);
2103 if (!o) {
2104 Py_DECREF(item);
2105 goto error;
2106 }
2107 PyTuple_SET_ITEM(item, i, o);
2108 }
2109 break;
2110 }
2111
2112 status = PyList_Append(list, item);
2113 Py_DECREF(item);
2114 if (status < 0)
2115 goto error;
2116
2117 if (state.ptr == state.start)
2118 state.start = (void*) ((char*) state.ptr + state.charsize);
2119 else
2120 state.start = state.ptr;
2121
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002122 }
Guido van Rossumb700df92000-03-31 14:59:30 +00002123
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002124 state_fini(&state);
2125 return list;
Guido van Rossumb700df92000-03-31 14:59:30 +00002126
2127error:
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002128 Py_DECREF(list);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002129 state_fini(&state);
2130 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002131
Guido van Rossumb700df92000-03-31 14:59:30 +00002132}
2133
#if PY_VERSION_HEX >= 0x02020000
/* pattern.finditer(): create a scanner from the arguments and return a
   callable-iterator that calls the scanner's search method until it
   returns None */
static PyObject*
pattern_finditer(PatternObject* pattern, PyObject* args, PyObject* kw)
{
    PyObject* scanner;
    PyObject* bound_search;
    PyObject* result;

    scanner = pattern_scanner(pattern, args, kw);
    if (scanner == NULL) {
        return NULL;
    }

    /* our own scanner reference is dropped once the method is fetched */
    bound_search = PyObject_GetAttrString(scanner, "search");
    Py_DECREF(scanner);
    if (bound_search == NULL) {
        return NULL;
    }

    result = PyCallIter_New(bound_search, Py_None);
    Py_DECREF(bound_search);

    return result;
}
#endif
2157
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002158static PyObject*
2159pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2160{
2161 SRE_STATE state;
2162 PyObject* list;
2163 PyObject* item;
2164 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002165 Py_ssize_t n;
2166 Py_ssize_t i;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002167 void* last;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002168
2169 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002170 Py_ssize_t maxsplit = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002171 static char* kwlist[] = { "source", "maxsplit", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002172 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002173 &string, &maxsplit))
2174 return NULL;
2175
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002176 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002177 if (!string)
2178 return NULL;
2179
2180 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002181 if (!list) {
2182 state_fini(&state);
2183 return NULL;
2184 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002185
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002186 n = 0;
2187 last = state.start;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002188
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002189 while (!maxsplit || n < maxsplit) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002190
2191 state_reset(&state);
2192
2193 state.ptr = state.start;
2194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 if (state.logical_charsize == 1) {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002196 status = sre_search(&state, PatternObject_GetCode(self));
2197 } else {
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002198 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002199 }
2200
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002201 if (PyErr_Occurred())
2202 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002203
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002204 if (status <= 0) {
2205 if (status == 0)
2206 break;
2207 pattern_error(status);
2208 goto error;
2209 }
Tim Peters3d563502006-01-21 02:47:53 +00002210
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002211 if (state.start == state.ptr) {
2212 if (last == state.end)
2213 break;
2214 /* skip one character */
2215 state.start = (void*) ((char*) state.ptr + state.charsize);
2216 continue;
2217 }
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002218
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002219 /* get segment before this match */
2220 item = PySequence_GetSlice(
2221 string, STATE_OFFSET(&state, last),
2222 STATE_OFFSET(&state, state.start)
2223 );
2224 if (!item)
2225 goto error;
2226 status = PyList_Append(list, item);
2227 Py_DECREF(item);
2228 if (status < 0)
2229 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002230
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002231 /* add groups (if any) */
2232 for (i = 0; i < self->groups; i++) {
2233 item = state_getslice(&state, i+1, string, 0);
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002234 if (!item)
2235 goto error;
2236 status = PyList_Append(list, item);
2237 Py_DECREF(item);
2238 if (status < 0)
2239 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002240 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002241
2242 n = n + 1;
2243
2244 last = state.start = state.ptr;
2245
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002246 }
2247
Fredrik Lundhf864aa82001-10-22 06:01:56 +00002248 /* get segment following last match (even if empty) */
2249 item = PySequence_GetSlice(
2250 string, STATE_OFFSET(&state, last), state.endpos
2251 );
2252 if (!item)
2253 goto error;
2254 status = PyList_Append(list, item);
2255 Py_DECREF(item);
2256 if (status < 0)
2257 goto error;
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002258
2259 state_fini(&state);
2260 return list;
2261
2262error:
2263 Py_DECREF(list);
2264 state_fini(&state);
2265 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002266
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002267}
Fredrik Lundh971e78b2001-10-20 17:48:46 +00002268
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002269static PyObject*
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002270pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002271 Py_ssize_t count, Py_ssize_t subn)
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002272{
2273 SRE_STATE state;
2274 PyObject* list;
2275 PyObject* item;
2276 PyObject* filter;
2277 PyObject* args;
2278 PyObject* match;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002279 void* ptr;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002280 int status;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002281 Py_ssize_t n;
2282 Py_ssize_t i, b, e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 int logical_charsize, charsize;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002284 int filter_is_callable;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002285 Py_buffer view;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002286
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002287 if (PyCallable_Check(ptemplate)) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002288 /* sub/subn takes either a function or a template */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002289 filter = ptemplate;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002290 Py_INCREF(filter);
2291 filter_is_callable = 1;
2292 } else {
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002293 /* if not callable, check if it's a literal string */
2294 int literal;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002295 view.buf = NULL;
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002296 ptr = getstring(ptemplate, &n, &logical_charsize, &charsize, &view);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 b = charsize;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002298 if (ptr) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002299 literal = sre_literal_template(b, ptr, n);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002300 } else {
2301 PyErr_Clear();
2302 literal = 0;
2303 }
Benjamin Petersone48944b2012-03-07 14:50:25 -06002304 if (view.buf)
2305 PyBuffer_Release(&view);
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002306 if (literal) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002307 filter = ptemplate;
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002308 Py_INCREF(filter);
2309 filter_is_callable = 0;
2310 } else {
2311 /* not a literal; hand it over to the template compiler */
2312 filter = call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00002313 SRE_PY_MODULE, "_subx",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002314 PyTuple_Pack(2, self, ptemplate)
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002315 );
2316 if (!filter)
2317 return NULL;
2318 filter_is_callable = PyCallable_Check(filter);
2319 }
Fredrik Lundhdac58492001-10-21 21:48:30 +00002320 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002321
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002322 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002323 if (!string) {
2324 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002325 return NULL;
Fredrik Lundh82b23072001-12-09 16:13:15 +00002326 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002327
2328 list = PyList_New(0);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002329 if (!list) {
Fredrik Lundh82b23072001-12-09 16:13:15 +00002330 Py_DECREF(filter);
Fredrik Lundh1296a8d2001-10-21 18:04:11 +00002331 state_fini(&state);
2332 return NULL;
2333 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002334
2335 n = i = 0;
2336
2337 while (!count || n < count) {
2338
2339 state_reset(&state);
2340
2341 state.ptr = state.start;
2342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 if (state.logical_charsize == 1) {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002344 status = sre_search(&state, PatternObject_GetCode(self));
2345 } else {
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002346 status = sre_usearch(&state, PatternObject_GetCode(self));
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002347 }
2348
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002349 if (PyErr_Occurred())
2350 goto error;
Thomas Wouters89f507f2006-12-13 04:49:30 +00002351
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002352 if (status <= 0) {
2353 if (status == 0)
2354 break;
2355 pattern_error(status);
2356 goto error;
2357 }
Tim Peters3d563502006-01-21 02:47:53 +00002358
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002359 b = STATE_OFFSET(&state, state.start);
2360 e = STATE_OFFSET(&state, state.ptr);
2361
2362 if (i < b) {
2363 /* get segment before this match */
2364 item = PySequence_GetSlice(string, i, b);
2365 if (!item)
2366 goto error;
2367 status = PyList_Append(list, item);
2368 Py_DECREF(item);
2369 if (status < 0)
2370 goto error;
2371
2372 } else if (i == b && i == e && n > 0)
2373 /* ignore empty match on latest position */
2374 goto next;
2375
2376 if (filter_is_callable) {
Fredrik Lundhdac58492001-10-21 21:48:30 +00002377 /* pass match object through filter */
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002378 match = pattern_new_match(self, &state, 1);
2379 if (!match)
2380 goto error;
Raymond Hettinger8ae46892003-10-12 19:09:37 +00002381 args = PyTuple_Pack(1, match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002382 if (!args) {
Guido van Rossum4e173842001-12-07 04:25:10 +00002383 Py_DECREF(match);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002384 goto error;
2385 }
2386 item = PyObject_CallObject(filter, args);
2387 Py_DECREF(args);
2388 Py_DECREF(match);
2389 if (!item)
2390 goto error;
2391 } else {
2392 /* filter is literal string */
2393 item = filter;
Fredrik Lundhdac58492001-10-21 21:48:30 +00002394 Py_INCREF(item);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002395 }
2396
2397 /* add to list */
Fredrik Lundh6de22ef2001-10-22 21:18:08 +00002398 if (item != Py_None) {
2399 status = PyList_Append(list, item);
2400 Py_DECREF(item);
2401 if (status < 0)
2402 goto error;
2403 }
Tim Peters3d563502006-01-21 02:47:53 +00002404
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002405 i = e;
2406 n = n + 1;
2407
2408next:
2409 /* move on */
2410 if (state.ptr == state.start)
2411 state.start = (void*) ((char*) state.ptr + state.charsize);
2412 else
2413 state.start = state.ptr;
2414
2415 }
2416
2417 /* get segment following last match */
Fredrik Lundhdac58492001-10-21 21:48:30 +00002418 if (i < state.endpos) {
2419 item = PySequence_GetSlice(string, i, state.endpos);
2420 if (!item)
2421 goto error;
2422 status = PyList_Append(list, item);
2423 Py_DECREF(item);
2424 if (status < 0)
2425 goto error;
2426 }
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002427
2428 state_fini(&state);
2429
Guido van Rossum4e173842001-12-07 04:25:10 +00002430 Py_DECREF(filter);
2431
Fredrik Lundhdac58492001-10-21 21:48:30 +00002432 /* convert list to single string (also removes list) */
Thomas Wouters1b7f8912007-09-19 03:06:30 +00002433 item = join_list(list, string);
Fredrik Lundhdac58492001-10-21 21:48:30 +00002434
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002435 if (!item)
2436 return NULL;
2437
2438 if (subn)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01002439 return Py_BuildValue("Nn", item, n);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002440
2441 return item;
2442
2443error:
2444 Py_DECREF(list);
2445 state_fini(&state);
Fredrik Lundh82b23072001-12-09 16:13:15 +00002446 Py_DECREF(filter);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002447 return NULL;
Tim Peters3d563502006-01-21 02:47:53 +00002448
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002449}
2450
2451static PyObject*
2452pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2453{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002454 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002455 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002456 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002457 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002458 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002459 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002460 return NULL;
2461
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002462 return pattern_subx(self, ptemplate, string, count, 0);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002463}
2464
2465static PyObject*
2466pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2467{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002468 PyObject* ptemplate;
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002469 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002470 Py_ssize_t count = 0;
Martin v. Löwis15e62742006-02-27 16:46:16 +00002471 static char* kwlist[] = { "repl", "string", "count", NULL };
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002472 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002473 &ptemplate, &string, &count))
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002474 return NULL;
2475
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002476 return pattern_subx(self, ptemplate, string, count, 1);
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002477}
Fredrik Lundhbec95b92001-10-21 16:47:57 +00002478
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002479static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002480pattern_copy(PatternObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002481{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002482#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002483 PatternObject* copy;
2484 int offset;
2485
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002486 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2487 if (!copy)
2488 return NULL;
2489
2490 offset = offsetof(PatternObject, groups);
2491
2492 Py_XINCREF(self->groupindex);
2493 Py_XINCREF(self->indexgroup);
2494 Py_XINCREF(self->pattern);
2495
2496 memcpy((char*) copy + offset, (char*) self + offset,
2497 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
Raymond Hettinger027bb632004-05-31 03:09:25 +00002498 copy->weakreflist = NULL;
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002499
2500 return (PyObject*) copy;
2501#else
2502 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2503 return NULL;
2504#endif
2505}
2506
2507static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002508pattern_deepcopy(PatternObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002509{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002510#ifdef USE_BUILTIN_COPY
2511 PatternObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00002512
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002513 copy = (PatternObject*) pattern_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002514 if (!copy)
2515 return NULL;
2516
2517 if (!deepcopy(&copy->groupindex, memo) ||
2518 !deepcopy(&copy->indexgroup, memo) ||
2519 !deepcopy(&copy->pattern, memo)) {
2520 Py_DECREF(copy);
2521 return NULL;
2522 }
2523
2524#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002525 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2526 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00002527#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00002528}
2529
/* Docstrings for the methods of compiled pattern objects.  Each
   pattern_*_doc string is referenced from the pattern_methods table, and
   pattern_doc is used as the tp_doc slot of Pattern_Type.  The first line
   of each method docstring is the call signature; the lines after it
   describe the result. */
Raymond Hettinger94478742004-09-24 04:31:19 +00002530PyDoc_STRVAR(pattern_match_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002531"match(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002532 Matches zero or more characters at the beginning of the string");
2533
2534PyDoc_STRVAR(pattern_search_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002535"search(string[, pos[, endpos]]) -> match object or None.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002536 Scan through string looking for a match, and return a corresponding\n\
Andrew Svetlov0b64c142012-12-25 18:48:54 +02002537 match object instance. Return None if no position in the string matches.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002538
2539PyDoc_STRVAR(pattern_split_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002540"split(string[, maxsplit = 0]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002541 Split string by the occurrences of pattern.");
2542
2543PyDoc_STRVAR(pattern_findall_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002544"findall(string[, pos[, endpos]]) -> list.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002545 Return a list of all non-overlapping matches of pattern in string.");
2546
2547PyDoc_STRVAR(pattern_finditer_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002548"finditer(string[, pos[, endpos]]) -> iterator.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002549 Return an iterator over all non-overlapping matches for the \n\
2550 RE pattern in string. For each match, the iterator returns a\n\
2551 match object.");
2552
2553PyDoc_STRVAR(pattern_sub_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002554"sub(repl, string[, count = 0]) -> newstring.\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002555 Return the string obtained by replacing the leftmost non-overlapping\n\
Tim Peters3d563502006-01-21 02:47:53 +00002556 occurrences of pattern in string by the replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002557
2558PyDoc_STRVAR(pattern_subn_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02002559"subn(repl, string[, count = 0]) -> (newstring, number of subs)\n\
Raymond Hettinger94478742004-09-24 04:31:19 +00002560 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2561 the leftmost non-overlapping occurrences of pattern with the\n\
Tim Peters3d563502006-01-21 02:47:53 +00002562 replacement repl.");
Raymond Hettinger94478742004-09-24 04:31:19 +00002563
2564PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2565
Fredrik Lundh75f2d672000-06-29 11:34:28 +00002566static PyMethodDef pattern_methods[] = {
Tim Peters3d563502006-01-21 02:47:53 +00002567 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002568 pattern_match_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002569 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002570 pattern_search_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002571 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002572 pattern_sub_doc},
Raymond Hettinger94478742004-09-24 04:31:19 +00002573 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002574 pattern_subn_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002575 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002576 pattern_split_doc},
Tim Peters3d563502006-01-21 02:47:53 +00002577 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002578 pattern_findall_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002579#if PY_VERSION_HEX >= 0x02020000
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002580 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS|METH_KEYWORDS,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002581 pattern_finditer_doc},
Fredrik Lundh703ce812001-10-24 22:16:30 +00002582#endif
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06002583 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS|METH_KEYWORDS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00002584 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2585 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00002586 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00002587};
2588
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00002589#define PAT_OFF(x) offsetof(PatternObject, x)
2590static PyMemberDef pattern_members[] = {
2591 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2592 {"flags", T_INT, PAT_OFF(flags), READONLY},
2593 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2594 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2595 {NULL} /* Sentinel */
2596};
Guido van Rossumb700df92000-03-31 14:59:30 +00002597
Neal Norwitz57c179c2006-03-22 07:18:02 +00002598static PyTypeObject Pattern_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00002599 PyVarObject_HEAD_INIT(NULL, 0)
2600 "_" SRE_MODULE ".SRE_Pattern",
Fredrik Lundh6f013982000-07-03 18:44:21 +00002601 sizeof(PatternObject), sizeof(SRE_CODE),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002602 (destructor)pattern_dealloc, /* tp_dealloc */
2603 0, /* tp_print */
2604 0, /* tp_getattr */
2605 0, /* tp_setattr */
2606 0, /* tp_reserved */
2607 0, /* tp_repr */
2608 0, /* tp_as_number */
2609 0, /* tp_as_sequence */
2610 0, /* tp_as_mapping */
2611 0, /* tp_hash */
2612 0, /* tp_call */
2613 0, /* tp_str */
2614 0, /* tp_getattro */
2615 0, /* tp_setattro */
2616 0, /* tp_as_buffer */
2617 Py_TPFLAGS_DEFAULT, /* tp_flags */
2618 pattern_doc, /* tp_doc */
2619 0, /* tp_traverse */
2620 0, /* tp_clear */
2621 0, /* tp_richcompare */
2622 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2623 0, /* tp_iter */
2624 0, /* tp_iternext */
2625 pattern_methods, /* tp_methods */
2626 pattern_members, /* tp_members */
Guido van Rossumb700df92000-03-31 14:59:30 +00002627};
2628
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002629static int _validate(PatternObject *self); /* Forward */
2630
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002631static PyObject *
2632_compile(PyObject* self_, PyObject* args)
2633{
2634 /* "compile" pattern descriptor to pattern object */
2635
2636 PatternObject* self;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002637 Py_ssize_t i, n;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002638
2639 PyObject* pattern;
2640 int flags = 0;
2641 PyObject* code;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002642 Py_ssize_t groups = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002643 PyObject* groupindex = NULL;
2644 PyObject* indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002645
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002646 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002647 &PyList_Type, &code, &groups,
2648 &groupindex, &indexgroup))
2649 return NULL;
2650
2651 n = PyList_GET_SIZE(code);
Christian Heimes587c2bf2008-01-19 16:21:02 +00002652 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002653 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2654 if (!self)
2655 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002656 self->weakreflist = NULL;
2657 self->pattern = NULL;
2658 self->groupindex = NULL;
2659 self->indexgroup = NULL;
Benjamin Petersone48944b2012-03-07 14:50:25 -06002660 self->view.buf = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002661
2662 self->codesize = n;
2663
2664 for (i = 0; i < n; i++) {
2665 PyObject *o = PyList_GET_ITEM(code, i);
Guido van Rossumddefaf32007-01-14 03:31:43 +00002666 unsigned long value = PyLong_AsUnsignedLong(o);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002667 self->code[i] = (SRE_CODE) value;
2668 if ((unsigned long) self->code[i] != value) {
2669 PyErr_SetString(PyExc_OverflowError,
2670 "regular expression code size limit exceeded");
2671 break;
2672 }
2673 }
2674
2675 if (PyErr_Occurred()) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00002676 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002677 return NULL;
2678 }
2679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 if (pattern == Py_None) {
2681 self->logical_charsize = -1;
2682 self->charsize = -1;
Victor Stinner63ab8752011-11-22 03:31:20 +01002683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 else {
2685 Py_ssize_t p_length;
2686 if (!getstring(pattern, &p_length, &self->logical_charsize,
Benjamin Peterson33d21a22012-03-07 14:59:13 -06002687 &self->charsize, &self->view)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 Py_DECREF(self);
2689 return NULL;
2690 }
2691 }
Antoine Pitroufd036452008-08-19 17:56:33 +00002692
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002693 Py_INCREF(pattern);
2694 self->pattern = pattern;
2695
2696 self->flags = flags;
2697
2698 self->groups = groups;
2699
2700 Py_XINCREF(groupindex);
2701 self->groupindex = groupindex;
2702
2703 Py_XINCREF(indexgroup);
2704 self->indexgroup = indexgroup;
2705
2706 self->weakreflist = NULL;
2707
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002708 if (!_validate(self)) {
2709 Py_DECREF(self);
2710 return NULL;
2711 }
2712
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002713 return (PyObject*) self;
2714}
2715
Guido van Rossumb700df92000-03-31 14:59:30 +00002716/* -------------------------------------------------------------------- */
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002717/* Code validation */
2718
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
   bytes wide (the latter if Python is compiled for "wide" unicode support).
*/
2741
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a parenthesized printf
   argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   All of these expect the enclosing function to have `code` and `end`
   pointers plus an `op`, `arg` or `skip` variable in scope, and they FAIL
   if `code` has already reached `end`. */

/* Read the next word into `op` and advance `code` */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)

/* Read the next word into `arg` and advance `code` */
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)

/* Read the next word into `skip`, check that the jump target
   code+skip-adj lies within [code, end], then advance `code`.

   The range check is done on integer distances rather than by forming
   code+skip-adj first: computing a pointer outside the array is undefined
   behaviour, so a "pointer wrapped around" comparison cannot be relied on.
   If skip < adj the unsigned subtraction wraps to a huge value and the
   check fails, matching the "target before code" case. */
#define GET_SKIP_ADJ(adj)                                           \
    do {                                                            \
        VTRACE(("%p= ", code));                                     \
        if (code >= end) FAIL;                                      \
        skip = *code;                                               \
        VTRACE(("%lu (skip to %p)\n",                               \
               (unsigned long)skip, code+skip));                    \
        if ((size_t)skip - (size_t)(adj) > (size_t)(end - code))    \
            FAIL;                                                   \
        code++;                                                     \
    } while (0)

/* Same as GET_SKIP_ADJ with no adjustment */
#define GET_SKIP GET_SKIP_ADJ(0)
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002782
2783static int
2784_validate_charset(SRE_CODE *code, SRE_CODE *end)
2785{
2786 /* Some variables are manipulated by the macros above */
2787 SRE_CODE op;
2788 SRE_CODE arg;
2789 SRE_CODE offset;
2790 int i;
2791
2792 while (code < end) {
2793 GET_OP;
2794 switch (op) {
2795
2796 case SRE_OP_NEGATE:
2797 break;
2798
2799 case SRE_OP_LITERAL:
2800 GET_ARG;
2801 break;
2802
2803 case SRE_OP_RANGE:
2804 GET_ARG;
2805 GET_ARG;
2806 break;
2807
2808 case SRE_OP_CHARSET:
2809 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2810 if (code+offset < code || code+offset > end)
2811 FAIL;
2812 code += offset;
2813 break;
2814
2815 case SRE_OP_BIGCHARSET:
2816 GET_ARG; /* Number of blocks */
2817 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2818 if (code+offset < code || code+offset > end)
2819 FAIL;
2820 /* Make sure that each byte points to a valid block */
2821 for (i = 0; i < 256; i++) {
2822 if (((unsigned char *)code)[i] >= arg)
2823 FAIL;
2824 }
2825 code += offset;
2826 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2827 if (code+offset < code || code+offset > end)
2828 FAIL;
2829 code += offset;
2830 break;
2831
2832 case SRE_OP_CATEGORY:
2833 GET_ARG;
2834 switch (arg) {
2835 case SRE_CATEGORY_DIGIT:
2836 case SRE_CATEGORY_NOT_DIGIT:
2837 case SRE_CATEGORY_SPACE:
2838 case SRE_CATEGORY_NOT_SPACE:
2839 case SRE_CATEGORY_WORD:
2840 case SRE_CATEGORY_NOT_WORD:
2841 case SRE_CATEGORY_LINEBREAK:
2842 case SRE_CATEGORY_NOT_LINEBREAK:
2843 case SRE_CATEGORY_LOC_WORD:
2844 case SRE_CATEGORY_LOC_NOT_WORD:
2845 case SRE_CATEGORY_UNI_DIGIT:
2846 case SRE_CATEGORY_UNI_NOT_DIGIT:
2847 case SRE_CATEGORY_UNI_SPACE:
2848 case SRE_CATEGORY_UNI_NOT_SPACE:
2849 case SRE_CATEGORY_UNI_WORD:
2850 case SRE_CATEGORY_UNI_NOT_WORD:
2851 case SRE_CATEGORY_UNI_LINEBREAK:
2852 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2853 break;
2854 default:
2855 FAIL;
2856 }
2857 break;
2858
2859 default:
2860 FAIL;
2861
2862 }
2863 }
2864
2865 return 1;
2866}
2867
2868static int
2869_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2870{
2871 /* Some variables are manipulated by the macros above */
2872 SRE_CODE op;
2873 SRE_CODE arg;
2874 SRE_CODE skip;
2875
2876 VTRACE(("code=%p, end=%p\n", code, end));
2877
2878 if (code > end)
2879 FAIL;
2880
2881 while (code < end) {
2882 GET_OP;
2883 switch (op) {
2884
2885 case SRE_OP_MARK:
2886 /* We don't check whether marks are properly nested; the
2887 sre_match() code is robust even if they don't, and the worst
2888 you can get is nonsensical match results. */
2889 GET_ARG;
2890 if (arg > 2*groups+1) {
2891 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2892 FAIL;
2893 }
2894 break;
2895
2896 case SRE_OP_LITERAL:
2897 case SRE_OP_NOT_LITERAL:
2898 case SRE_OP_LITERAL_IGNORE:
2899 case SRE_OP_NOT_LITERAL_IGNORE:
2900 GET_ARG;
2901 /* The arg is just a character, nothing to check */
2902 break;
2903
2904 case SRE_OP_SUCCESS:
2905 case SRE_OP_FAILURE:
2906 /* Nothing to check; these normally end the matching process */
2907 break;
2908
2909 case SRE_OP_AT:
2910 GET_ARG;
2911 switch (arg) {
2912 case SRE_AT_BEGINNING:
2913 case SRE_AT_BEGINNING_STRING:
2914 case SRE_AT_BEGINNING_LINE:
2915 case SRE_AT_END:
2916 case SRE_AT_END_LINE:
2917 case SRE_AT_END_STRING:
2918 case SRE_AT_BOUNDARY:
2919 case SRE_AT_NON_BOUNDARY:
2920 case SRE_AT_LOC_BOUNDARY:
2921 case SRE_AT_LOC_NON_BOUNDARY:
2922 case SRE_AT_UNI_BOUNDARY:
2923 case SRE_AT_UNI_NON_BOUNDARY:
2924 break;
2925 default:
2926 FAIL;
2927 }
2928 break;
2929
2930 case SRE_OP_ANY:
2931 case SRE_OP_ANY_ALL:
2932 /* These have no operands */
2933 break;
2934
2935 case SRE_OP_IN:
2936 case SRE_OP_IN_IGNORE:
2937 GET_SKIP;
2938 /* Stop 1 before the end; we check the FAILURE below */
2939 if (!_validate_charset(code, code+skip-2))
2940 FAIL;
2941 if (code[skip-2] != SRE_OP_FAILURE)
2942 FAIL;
2943 code += skip-1;
2944 break;
2945
2946 case SRE_OP_INFO:
2947 {
2948 /* A minimal info field is
2949 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2950 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2951 more follows. */
Ross Lagerwall88748d72012-03-06 21:48:57 +02002952 SRE_CODE flags, i;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002953 SRE_CODE *newcode;
2954 GET_SKIP;
2955 newcode = code+skip-1;
2956 GET_ARG; flags = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02002957 GET_ARG;
2958 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002959 /* Check that only valid flags are present */
2960 if ((flags & ~(SRE_INFO_PREFIX |
2961 SRE_INFO_LITERAL |
2962 SRE_INFO_CHARSET)) != 0)
2963 FAIL;
2964 /* PREFIX and CHARSET are mutually exclusive */
2965 if ((flags & SRE_INFO_PREFIX) &&
2966 (flags & SRE_INFO_CHARSET))
2967 FAIL;
2968 /* LITERAL implies PREFIX */
2969 if ((flags & SRE_INFO_LITERAL) &&
2970 !(flags & SRE_INFO_PREFIX))
2971 FAIL;
2972 /* Validate the prefix */
2973 if (flags & SRE_INFO_PREFIX) {
Ross Lagerwall88748d72012-03-06 21:48:57 +02002974 SRE_CODE prefix_len;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002975 GET_ARG; prefix_len = arg;
Ross Lagerwall88748d72012-03-06 21:48:57 +02002976 GET_ARG;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00002977 /* Here comes the prefix string */
2978 if (code+prefix_len < code || code+prefix_len > newcode)
2979 FAIL;
2980 code += prefix_len;
2981 /* And here comes the overlap table */
2982 if (code+prefix_len < code || code+prefix_len > newcode)
2983 FAIL;
2984 /* Each overlap value should be < prefix_len */
2985 for (i = 0; i < prefix_len; i++) {
2986 if (code[i] >= prefix_len)
2987 FAIL;
2988 }
2989 code += prefix_len;
2990 }
2991 /* Validate the charset */
2992 if (flags & SRE_INFO_CHARSET) {
2993 if (!_validate_charset(code, newcode-1))
2994 FAIL;
2995 if (newcode[-1] != SRE_OP_FAILURE)
2996 FAIL;
2997 code = newcode;
2998 }
2999 else if (code != newcode) {
3000 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3001 FAIL;
3002 }
3003 }
3004 break;
3005
3006 case SRE_OP_BRANCH:
3007 {
3008 SRE_CODE *target = NULL;
3009 for (;;) {
3010 GET_SKIP;
3011 if (skip == 0)
3012 break;
3013 /* Stop 2 before the end; we check the JUMP below */
3014 if (!_validate_inner(code, code+skip-3, groups))
3015 FAIL;
3016 code += skip-3;
3017 /* Check that it ends with a JUMP, and that each JUMP
3018 has the same target */
3019 GET_OP;
3020 if (op != SRE_OP_JUMP)
3021 FAIL;
3022 GET_SKIP;
3023 if (target == NULL)
3024 target = code+skip-1;
3025 else if (code+skip-1 != target)
3026 FAIL;
3027 }
3028 }
3029 break;
3030
3031 case SRE_OP_REPEAT_ONE:
3032 case SRE_OP_MIN_REPEAT_ONE:
3033 {
3034 SRE_CODE min, max;
3035 GET_SKIP;
3036 GET_ARG; min = arg;
3037 GET_ARG; max = arg;
3038 if (min > max)
3039 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003040 if (max > 65535)
3041 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003042 if (!_validate_inner(code, code+skip-4, groups))
3043 FAIL;
3044 code += skip-4;
3045 GET_OP;
3046 if (op != SRE_OP_SUCCESS)
3047 FAIL;
3048 }
3049 break;
3050
3051 case SRE_OP_REPEAT:
3052 {
3053 SRE_CODE min, max;
3054 GET_SKIP;
3055 GET_ARG; min = arg;
3056 GET_ARG; max = arg;
3057 if (min > max)
3058 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003059 if (max > 65535)
3060 FAIL;
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003061 if (!_validate_inner(code, code+skip-3, groups))
3062 FAIL;
3063 code += skip-3;
3064 GET_OP;
3065 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3066 FAIL;
3067 }
3068 break;
3069
3070 case SRE_OP_GROUPREF:
3071 case SRE_OP_GROUPREF_IGNORE:
3072 GET_ARG;
3073 if (arg >= groups)
3074 FAIL;
3075 break;
3076
3077 case SRE_OP_GROUPREF_EXISTS:
3078 /* The regex syntax for this is: '(?(group)then|else)', where
3079 'group' is either an integer group number or a group name,
3080 'then' and 'else' are sub-regexes, and 'else' is optional. */
3081 GET_ARG;
3082 if (arg >= groups)
3083 FAIL;
Guido van Rossum92f8f3e2008-09-10 14:30:50 +00003084 GET_SKIP_ADJ(1);
Guido van Rossum10faf6a2008-08-06 19:29:14 +00003085 code--; /* The skip is relative to the first arg! */
3086 /* There are two possibilities here: if there is both a 'then'
3087 part and an 'else' part, the generated code looks like:
3088
3089 GROUPREF_EXISTS
3090 <group>
3091 <skipyes>
3092 ...then part...
3093 JUMP
3094 <skipno>
3095 (<skipyes> jumps here)
3096 ...else part...
3097 (<skipno> jumps here)
3098
3099 If there is only a 'then' part, it looks like:
3100
3101 GROUPREF_EXISTS
3102 <group>
3103 <skip>
3104 ...then part...
3105 (<skip> jumps here)
3106
3107 There is no direct way to decide which it is, and we don't want
3108 to allow arbitrary jumps anywhere in the code; so we just look
3109 for a JUMP opcode preceding our skip target.
3110 */
3111 if (skip >= 3 && code+skip-3 >= code &&
3112 code[skip-3] == SRE_OP_JUMP)
3113 {
3114 VTRACE(("both then and else parts present\n"));
3115 if (!_validate_inner(code+1, code+skip-3, groups))
3116 FAIL;
3117 code += skip-2; /* Position after JUMP, at <skipno> */
3118 GET_SKIP;
3119 if (!_validate_inner(code, code+skip-1, groups))
3120 FAIL;
3121 code += skip-1;
3122 }
3123 else {
3124 VTRACE(("only a then part present\n"));
3125 if (!_validate_inner(code+1, code+skip-1, groups))
3126 FAIL;
3127 code += skip-1;
3128 }
3129 break;
3130
3131 case SRE_OP_ASSERT:
3132 case SRE_OP_ASSERT_NOT:
3133 GET_SKIP;
3134 GET_ARG; /* 0 for lookahead, width for lookbehind */
3135 code--; /* Back up over arg to simplify math below */
3136 if (arg & 0x80000000)
3137 FAIL; /* Width too large */
3138 /* Stop 1 before the end; we check the SUCCESS below */
3139 if (!_validate_inner(code+1, code+skip-2, groups))
3140 FAIL;
3141 code += skip-2;
3142 GET_OP;
3143 if (op != SRE_OP_SUCCESS)
3144 FAIL;
3145 break;
3146
3147 default:
3148 FAIL;
3149
3150 }
3151 }
3152
3153 VTRACE(("okay\n"));
3154 return 1;
3155}
3156
3157static int
3158_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3159{
3160 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3161 FAIL;
3162 if (groups == 0) /* fix for simplejson */
3163 groups = 100; /* 100 groups should always be safe */
3164 return _validate_inner(code, end-1, groups);
3165}
3166
3167static int
3168_validate(PatternObject *self)
3169{
3170 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3171 {
3172 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3173 return 0;
3174 }
3175 else
3176 VTRACE(("Success!\n"));
3177 return 1;
3178}
3179
3180/* -------------------------------------------------------------------- */
Guido van Rossumb700df92000-03-31 14:59:30 +00003181/* match methods */
3182
3183static void
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003184match_dealloc(MatchObject* self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003185{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003186 Py_XDECREF(self->regs);
3187 Py_XDECREF(self->string);
3188 Py_DECREF(self->pattern);
3189 PyObject_DEL(self);
Guido van Rossumb700df92000-03-31 14:59:30 +00003190}
3191
3192static PyObject*
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003193match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
Guido van Rossumb700df92000-03-31 14:59:30 +00003194{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003195 if (index < 0 || index >= self->groups) {
3196 /* raise IndexError if we were given a bad group number */
3197 PyErr_SetString(
3198 PyExc_IndexError,
3199 "no such group"
3200 );
3201 return NULL;
3202 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003203
Fredrik Lundh6f013982000-07-03 18:44:21 +00003204 index *= 2;
3205
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003206 if (self->string == Py_None || self->mark[index] < 0) {
3207 /* return default value if the string or group is undefined */
3208 Py_INCREF(def);
3209 return def;
3210 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003211
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003212 return PySequence_GetSlice(
3213 self->string, self->mark[index], self->mark[index+1]
3214 );
Guido van Rossumb700df92000-03-31 14:59:30 +00003215}
3216
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003217static Py_ssize_t
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003218match_getindex(MatchObject* self, PyObject* index)
Guido van Rossumb700df92000-03-31 14:59:30 +00003219{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003220 Py_ssize_t i;
Guido van Rossumb700df92000-03-31 14:59:30 +00003221
Guido van Rossumddefaf32007-01-14 03:31:43 +00003222 if (index == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003223 /* Default value */
3224 return 0;
Guido van Rossumddefaf32007-01-14 03:31:43 +00003225
Christian Heimes217cfd12007-12-02 14:31:20 +00003226 if (PyLong_Check(index))
3227 return PyLong_AsSsize_t(index);
Guido van Rossumb700df92000-03-31 14:59:30 +00003228
Fredrik Lundh6f013982000-07-03 18:44:21 +00003229 i = -1;
3230
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003231 if (self->pattern->groupindex) {
3232 index = PyObject_GetItem(self->pattern->groupindex, index);
3233 if (index) {
Neal Norwitz1fe5f382007-08-31 04:32:55 +00003234 if (PyLong_Check(index))
Christian Heimes217cfd12007-12-02 14:31:20 +00003235 i = PyLong_AsSsize_t(index);
Fredrik Lundh6f013982000-07-03 18:44:21 +00003236 Py_DECREF(index);
3237 } else
3238 PyErr_Clear();
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003239 }
Fredrik Lundh6f013982000-07-03 18:44:21 +00003240
3241 return i;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003242}
3243
3244static PyObject*
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003245match_getslice(MatchObject* self, PyObject* index, PyObject* def)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003246{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003247 return match_getslice_by_index(self, match_getindex(self, index), def);
Guido van Rossumb700df92000-03-31 14:59:30 +00003248}
3249
3250static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003251match_expand(MatchObject* self, PyObject* ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003252{
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003253 /* delegate to Python code */
3254 return call(
Thomas Wouters9ada3d62006-04-21 09:47:09 +00003255 SRE_PY_MODULE, "_expand",
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003256 PyTuple_Pack(3, self->pattern, self, ptemplate)
Fredrik Lundh5644b7f2000-09-21 17:03:25 +00003257 );
3258}
3259
3260static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003261match_group(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003262{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003263 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003264 Py_ssize_t i, size;
Guido van Rossumb700df92000-03-31 14:59:30 +00003265
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003266 size = PyTuple_GET_SIZE(args);
Guido van Rossumb700df92000-03-31 14:59:30 +00003267
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003268 switch (size) {
3269 case 0:
3270 result = match_getslice(self, Py_False, Py_None);
3271 break;
3272 case 1:
3273 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3274 break;
3275 default:
3276 /* fetch multiple items */
3277 result = PyTuple_New(size);
3278 if (!result)
3279 return NULL;
3280 for (i = 0; i < size; i++) {
3281 PyObject* item = match_getslice(
Fredrik Lundhdf02d0b2000-06-30 07:08:20 +00003282 self, PyTuple_GET_ITEM(args, i), Py_None
3283 );
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003284 if (!item) {
3285 Py_DECREF(result);
3286 return NULL;
3287 }
3288 PyTuple_SET_ITEM(result, i, item);
3289 }
3290 break;
3291 }
3292 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003293}
3294
3295static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003296match_groups(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003297{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003298 PyObject* result;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003299 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003300
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003301 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003302 static char* kwlist[] = { "default", NULL };
Fredrik Lundh562586e2000-10-03 20:43:34 +00003303 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003304 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003305
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003306 result = PyTuple_New(self->groups-1);
3307 if (!result)
3308 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003309
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003310 for (index = 1; index < self->groups; index++) {
3311 PyObject* item;
3312 item = match_getslice_by_index(self, index, def);
3313 if (!item) {
3314 Py_DECREF(result);
3315 return NULL;
3316 }
3317 PyTuple_SET_ITEM(result, index-1, item);
3318 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003319
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003320 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003321}
3322
3323static PyObject*
Fredrik Lundh562586e2000-10-03 20:43:34 +00003324match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
Guido van Rossumb700df92000-03-31 14:59:30 +00003325{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003326 PyObject* result;
3327 PyObject* keys;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003328 Py_ssize_t index;
Guido van Rossumb700df92000-03-31 14:59:30 +00003329
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003330 PyObject* def = Py_None;
Martin v. Löwis15e62742006-02-27 16:46:16 +00003331 static char* kwlist[] = { "default", NULL };
Fredrik Lundh770617b2001-01-14 15:06:11 +00003332 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003333 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003334
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003335 result = PyDict_New();
3336 if (!result || !self->pattern->groupindex)
3337 return result;
Guido van Rossumb700df92000-03-31 14:59:30 +00003338
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003339 keys = PyMapping_Keys(self->pattern->groupindex);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003340 if (!keys)
3341 goto failed;
Guido van Rossumb700df92000-03-31 14:59:30 +00003342
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003343 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
Fredrik Lundh770617b2001-01-14 15:06:11 +00003344 int status;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003345 PyObject* key;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003346 PyObject* value;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003347 key = PyList_GET_ITEM(keys, index);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003348 if (!key)
3349 goto failed;
3350 value = match_getslice(self, key, def);
3351 if (!value) {
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003352 Py_DECREF(key);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003353 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003354 }
Fredrik Lundh770617b2001-01-14 15:06:11 +00003355 status = PyDict_SetItem(result, key, value);
3356 Py_DECREF(value);
3357 if (status < 0)
3358 goto failed;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003359 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003360
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003361 Py_DECREF(keys);
Guido van Rossumb700df92000-03-31 14:59:30 +00003362
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003363 return result;
Fredrik Lundh770617b2001-01-14 15:06:11 +00003364
3365failed:
Neal Norwitz60da3162006-03-07 04:48:24 +00003366 Py_XDECREF(keys);
Fredrik Lundh770617b2001-01-14 15:06:11 +00003367 Py_DECREF(result);
3368 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003369}
3370
3371static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003372match_start(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003373{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003374 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003375
Guido van Rossumddefaf32007-01-14 03:31:43 +00003376 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003377 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003378 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003379
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003380 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003381
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003382 if (index < 0 || index >= self->groups) {
3383 PyErr_SetString(
3384 PyExc_IndexError,
3385 "no such group"
3386 );
3387 return NULL;
3388 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003389
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003390 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003391 return PyLong_FromSsize_t(self->mark[index*2]);
Guido van Rossumb700df92000-03-31 14:59:30 +00003392}
3393
3394static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003395match_end(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003396{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003397 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003398
Guido van Rossumddefaf32007-01-14 03:31:43 +00003399 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003400 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003401 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003402
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003403 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003404
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003405 if (index < 0 || index >= self->groups) {
3406 PyErr_SetString(
3407 PyExc_IndexError,
3408 "no such group"
3409 );
3410 return NULL;
3411 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003412
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003413 /* mark is -1 if group is undefined */
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003414 return PyLong_FromSsize_t(self->mark[index*2+1]);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003415}
3416
3417LOCAL(PyObject*)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003418_pair(Py_ssize_t i1, Py_ssize_t i2)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003419{
3420 PyObject* pair;
3421 PyObject* item;
3422
3423 pair = PyTuple_New(2);
3424 if (!pair)
3425 return NULL;
3426
Christian Heimes217cfd12007-12-02 14:31:20 +00003427 item = PyLong_FromSsize_t(i1);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003428 if (!item)
3429 goto error;
3430 PyTuple_SET_ITEM(pair, 0, item);
3431
Christian Heimes217cfd12007-12-02 14:31:20 +00003432 item = PyLong_FromSsize_t(i2);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003433 if (!item)
3434 goto error;
3435 PyTuple_SET_ITEM(pair, 1, item);
3436
3437 return pair;
3438
3439 error:
3440 Py_DECREF(pair);
3441 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003442}
3443
3444static PyObject*
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003445match_span(MatchObject* self, PyObject* args)
Guido van Rossumb700df92000-03-31 14:59:30 +00003446{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003447 Py_ssize_t index;
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003448
Guido van Rossumddefaf32007-01-14 03:31:43 +00003449 PyObject* index_ = NULL;
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003450 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003451 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003452
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003453 index = match_getindex(self, index_);
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003454
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003455 if (index < 0 || index >= self->groups) {
3456 PyErr_SetString(
3457 PyExc_IndexError,
3458 "no such group"
3459 );
3460 return NULL;
3461 }
Guido van Rossumb700df92000-03-31 14:59:30 +00003462
Fredrik Lundh510c97b2000-09-02 16:36:57 +00003463 /* marks are -1 if group is undefined */
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003464 return _pair(self->mark[index*2], self->mark[index*2+1]);
3465}
3466
3467static PyObject*
3468match_regs(MatchObject* self)
3469{
3470 PyObject* regs;
3471 PyObject* item;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003472 Py_ssize_t index;
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003473
3474 regs = PyTuple_New(self->groups);
3475 if (!regs)
3476 return NULL;
3477
3478 for (index = 0; index < self->groups; index++) {
3479 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3480 if (!item) {
3481 Py_DECREF(regs);
3482 return NULL;
3483 }
3484 PyTuple_SET_ITEM(regs, index, item);
3485 }
3486
3487 Py_INCREF(regs);
3488 self->regs = regs;
3489
3490 return regs;
Guido van Rossumb700df92000-03-31 14:59:30 +00003491}
3492
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003493static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003494match_copy(MatchObject* self, PyObject *unused)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003495{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003496#ifdef USE_BUILTIN_COPY
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003497 MatchObject* copy;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003498 Py_ssize_t slots, offset;
Tim Peters3d563502006-01-21 02:47:53 +00003499
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003500 slots = 2 * (self->pattern->groups+1);
3501
3502 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3503 if (!copy)
3504 return NULL;
3505
3506 /* this value a constant, but any compiler should be able to
3507 figure that out all by itself */
3508 offset = offsetof(MatchObject, string);
3509
3510 Py_XINCREF(self->pattern);
3511 Py_XINCREF(self->string);
3512 Py_XINCREF(self->regs);
3513
3514 memcpy((char*) copy + offset, (char*) self + offset,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003515 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003516
3517 return (PyObject*) copy;
3518#else
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003519 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003520 return NULL;
3521#endif
3522}
3523
3524static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003525match_deepcopy(MatchObject* self, PyObject* memo)
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003526{
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003527#ifdef USE_BUILTIN_COPY
3528 MatchObject* copy;
Tim Peters3d563502006-01-21 02:47:53 +00003529
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003530 copy = (MatchObject*) match_copy(self);
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003531 if (!copy)
3532 return NULL;
3533
3534 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3535 !deepcopy(&copy->string, memo) ||
3536 !deepcopy(&copy->regs, memo)) {
3537 Py_DECREF(copy);
3538 return NULL;
3539 }
3540
3541#else
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003542 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3543 return NULL;
Fredrik Lundhd89a2e72001-07-03 20:32:36 +00003544#endif
Fredrik Lundhb0f05bd2001-07-02 16:42:49 +00003545}
3546
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003547PyDoc_STRVAR(match_doc,
3548"The result of re.match() and re.search().\n\
3549Match objects always have a boolean value of True.");
3550
3551PyDoc_STRVAR(match_group_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003552"group([group1, ...]) -> str or tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003553 Return subgroup(s) of the match by indices or names.\n\
3554 For 0 returns the entire match.");
3555
3556PyDoc_STRVAR(match_start_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003557"start([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003558 Return index of the start of the substring matched by group.");
3559
3560PyDoc_STRVAR(match_end_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003561"end([group=0]) -> int.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003562 Return index of the end of the substring matched by group.");
3563
3564PyDoc_STRVAR(match_span_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003565"span([group]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003566 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3567
3568PyDoc_STRVAR(match_groups_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003569"groups([default=None]) -> tuple.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003570 Return a tuple containing all the subgroups of the match, from 1.\n\
3571 The default argument is used for groups\n\
3572 that did not participate in the match");
3573
3574PyDoc_STRVAR(match_groupdict_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003575"groupdict([default=None]) -> dict.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003576 Return a dictionary containing all the named subgroups of the match,\n\
3577 keyed by the subgroup name. The default argument is used for groups\n\
3578 that did not participate in the match");
3579
3580PyDoc_STRVAR(match_expand_doc,
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003581"expand(template) -> str.\n\
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003582 Return the string obtained by doing backslash substitution\n\
3583 on the string template, as done by the sub() method.");
3584
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003585static PyMethodDef match_methods[] = {
Andrew Svetlov56ad5ed2012-12-23 19:23:07 +02003586 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3587 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3588 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3589 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3590 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3591 match_groups_doc},
3592 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3593 match_groupdict_doc},
3594 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003595 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3596 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003597 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003598};
3599
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003600static PyObject *
3601match_lastindex_get(MatchObject *self)
Guido van Rossumb700df92000-03-31 14:59:30 +00003602{
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003603 if (self->lastindex >= 0)
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01003604 return PyLong_FromSsize_t(self->lastindex);
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003605 Py_INCREF(Py_None);
3606 return Py_None;
Guido van Rossumb700df92000-03-31 14:59:30 +00003607}
3608
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003609static PyObject *
3610match_lastgroup_get(MatchObject *self)
3611{
3612 if (self->pattern->indexgroup && self->lastindex >= 0) {
3613 PyObject* result = PySequence_GetItem(
3614 self->pattern->indexgroup, self->lastindex
3615 );
3616 if (result)
3617 return result;
3618 PyErr_Clear();
3619 }
3620 Py_INCREF(Py_None);
3621 return Py_None;
3622}
3623
3624static PyObject *
3625match_regs_get(MatchObject *self)
3626{
3627 if (self->regs) {
3628 Py_INCREF(self->regs);
3629 return self->regs;
3630 } else
3631 return match_regs(self);
3632}
3633
3634static PyGetSetDef match_getset[] = {
3635 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3636 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3637 {"regs", (getter)match_regs_get, (setter)NULL},
3638 {NULL}
3639};
3640
3641#define MATCH_OFF(x) offsetof(MatchObject, x)
3642static PyMemberDef match_members[] = {
3643 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3644 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3645 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3646 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3647 {NULL}
3648};
3649
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3652
Neal Norwitz57c179c2006-03-22 07:18:02 +00003653static PyTypeObject Match_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003654 PyVarObject_HEAD_INIT(NULL,0)
3655 "_" SRE_MODULE ".SRE_Match",
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003656 sizeof(MatchObject), sizeof(Py_ssize_t),
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003657 (destructor)match_dealloc, /* tp_dealloc */
3658 0, /* tp_print */
3659 0, /* tp_getattr */
3660 0, /* tp_setattr */
3661 0, /* tp_reserved */
3662 0, /* tp_repr */
3663 0, /* tp_as_number */
3664 0, /* tp_as_sequence */
3665 0, /* tp_as_mapping */
3666 0, /* tp_hash */
3667 0, /* tp_call */
3668 0, /* tp_str */
3669 0, /* tp_getattro */
3670 0, /* tp_setattro */
3671 0, /* tp_as_buffer */
3672 Py_TPFLAGS_DEFAULT, /* tp_flags */
Andrew Svetlov70dcef42012-12-23 19:59:27 +02003673 match_doc, /* tp_doc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003674 0, /* tp_traverse */
3675 0, /* tp_clear */
3676 0, /* tp_richcompare */
3677 0, /* tp_weaklistoffset */
3678 0, /* tp_iter */
3679 0, /* tp_iternext */
3680 match_methods, /* tp_methods */
3681 match_members, /* tp_members */
3682 match_getset, /* tp_getset */
Guido van Rossumb700df92000-03-31 14:59:30 +00003683};
3684
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003685static PyObject*
3686pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3687{
3688 /* create match object (from state object) */
3689
3690 MatchObject* match;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003691 Py_ssize_t i, j;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003692 char* base;
3693 int n;
3694
3695 if (status > 0) {
3696
3697 /* create match object (with room for extra group marks) */
Christian Heimes587c2bf2008-01-19 16:21:02 +00003698 /* coverity[ampersand_in_size] */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003699 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3700 2*(pattern->groups+1));
3701 if (!match)
3702 return NULL;
3703
3704 Py_INCREF(pattern);
3705 match->pattern = pattern;
3706
3707 Py_INCREF(state->string);
3708 match->string = state->string;
3709
3710 match->regs = NULL;
3711 match->groups = pattern->groups+1;
3712
3713 /* fill in group slices */
3714
3715 base = (char*) state->beginning;
3716 n = state->charsize;
3717
3718 match->mark[0] = ((char*) state->start - base) / n;
3719 match->mark[1] = ((char*) state->ptr - base) / n;
3720
3721 for (i = j = 0; i < pattern->groups; i++, j+=2)
3722 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3723 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3724 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3725 } else
3726 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3727
3728 match->pos = state->pos;
3729 match->endpos = state->endpos;
3730
3731 match->lastindex = state->lastindex;
3732
3733 return (PyObject*) match;
3734
3735 } else if (status == 0) {
3736
3737 /* no match */
3738 Py_INCREF(Py_None);
3739 return Py_None;
3740
3741 }
3742
3743 /* internal error */
3744 pattern_error(status);
3745 return NULL;
3746}
3747
3748
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003749/* -------------------------------------------------------------------- */
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003750/* scanner methods (experimental) */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003751
3752static void
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003753scanner_dealloc(ScannerObject* self)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003754{
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003755 state_fini(&self->state);
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003756 Py_XDECREF(self->pattern);
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003757 PyObject_DEL(self);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003758}
3759
3760static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003761scanner_match(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003762{
3763 SRE_STATE* state = &self->state;
3764 PyObject* match;
3765 int status;
3766
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003767 state_reset(state);
3768
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003769 state->ptr = state->start;
3770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 if (state->logical_charsize == 1) {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003772 status = sre_match(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003773 } else {
Gustavo Niemeyer2cbdc2a2003-12-13 20:32:08 +00003774 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003775 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003776 if (PyErr_Occurred())
3777 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003778
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003779 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003780 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003781
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003782 if (status == 0 || state->ptr == state->start)
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003783 state->start = (void*) ((char*) state->ptr + state->charsize);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003784 else
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003785 state->start = state->ptr;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003786
3787 return match;
3788}
3789
3790
3791static PyObject*
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003792scanner_search(ScannerObject* self, PyObject *unused)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003793{
3794 SRE_STATE* state = &self->state;
3795 PyObject* match;
3796 int status;
3797
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00003798 state_reset(state);
3799
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003800 state->ptr = state->start;
3801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003802 if (state->logical_charsize == 1) {
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003803 status = sre_search(state, PatternObject_GetCode(self->pattern));
3804 } else {
3805 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3806 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003807 if (PyErr_Occurred())
3808 return NULL;
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003809
Fredrik Lundh75f2d672000-06-29 11:34:28 +00003810 match = pattern_new_match((PatternObject*) self->pattern,
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003811 state, status);
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003812
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00003813 if (status == 0 || state->ptr == state->start)
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003814 state->start = (void*) ((char*) state->ptr + state->charsize);
3815 else
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003816 state->start = state->ptr;
3817
3818 return match;
3819}
3820
Fredrik Lundhbe2211e2000-06-29 16:57:40 +00003821static PyMethodDef scanner_methods[] = {
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003822 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3823 {"search", (PyCFunction) scanner_search, METH_NOARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003824 {NULL, NULL}
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003825};
3826
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003827#define SCAN_OFF(x) offsetof(ScannerObject, x)
3828static PyMemberDef scanner_members[] = {
Ezio Melotti7c8c1ea2011-09-29 01:00:19 +03003829 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003830 {NULL} /* Sentinel */
3831};
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003832
Neal Norwitz57c179c2006-03-22 07:18:02 +00003833static PyTypeObject Scanner_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003834 PyVarObject_HEAD_INIT(NULL, 0)
3835 "_" SRE_MODULE ".SRE_Scanner",
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003836 sizeof(ScannerObject), 0,
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +00003837 (destructor)scanner_dealloc,/* tp_dealloc */
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003838 0, /* tp_print */
3839 0, /* tp_getattr */
3840 0, /* tp_setattr */
3841 0, /* tp_reserved */
3842 0, /* tp_repr */
3843 0, /* tp_as_number */
3844 0, /* tp_as_sequence */
3845 0, /* tp_as_mapping */
3846 0, /* tp_hash */
3847 0, /* tp_call */
3848 0, /* tp_str */
3849 0, /* tp_getattro */
3850 0, /* tp_setattro */
3851 0, /* tp_as_buffer */
3852 Py_TPFLAGS_DEFAULT, /* tp_flags */
3853 0, /* tp_doc */
3854 0, /* tp_traverse */
3855 0, /* tp_clear */
3856 0, /* tp_richcompare */
3857 0, /* tp_weaklistoffset */
3858 0, /* tp_iter */
3859 0, /* tp_iternext */
3860 scanner_methods, /* tp_methods */
3861 scanner_members, /* tp_members */
3862 0, /* tp_getset */
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +00003863};
3864
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003865static PyObject*
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003866pattern_scanner(PatternObject* pattern, PyObject* args, PyObject* kw)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003867{
3868 /* create search state object */
3869
3870 ScannerObject* self;
3871
3872 PyObject* string;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003873 Py_ssize_t start = 0;
3874 Py_ssize_t end = PY_SSIZE_T_MAX;
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06003875 static char* kwlist[] = { "source", "pos", "endpos", NULL };
3876 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:scanner", kwlist,
3877 &string, &start, &end))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003878 return NULL;
3879
3880 /* create scanner object */
3881 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3882 if (!self)
3883 return NULL;
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003884 self->pattern = NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003885
3886 string = state_init(&self->state, pattern, string, start, end);
3887 if (!string) {
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00003888 Py_DECREF(self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003889 return NULL;
3890 }
3891
3892 Py_INCREF(pattern);
3893 self->pattern = (PyObject*) pattern;
3894
3895 return (PyObject*) self;
3896}
3897
Guido van Rossumb700df92000-03-31 14:59:30 +00003898static PyMethodDef _functions[] = {
Neal Norwitzb0493252002-03-31 14:44:22 +00003899 {"compile", _compile, METH_VARARGS},
Thomas Wouters4d70c3d2006-06-08 14:42:34 +00003900 {"getcodesize", sre_codesize, METH_NOARGS},
Neal Norwitzb0493252002-03-31 14:44:22 +00003901 {"getlower", sre_getlower, METH_VARARGS},
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +00003902 {NULL, NULL}
Guido van Rossumb700df92000-03-31 14:59:30 +00003903};
3904
Martin v. Löwis1a214512008-06-11 05:26:20 +00003905static struct PyModuleDef sremodule = {
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003906 PyModuleDef_HEAD_INIT,
3907 "_" SRE_MODULE,
3908 NULL,
3909 -1,
3910 _functions,
3911 NULL,
3912 NULL,
3913 NULL,
3914 NULL
Martin v. Löwis1a214512008-06-11 05:26:20 +00003915};
3916
3917PyMODINIT_FUNC PyInit__sre(void)
Guido van Rossumb700df92000-03-31 14:59:30 +00003918{
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003919 PyObject* m;
3920 PyObject* d;
Barry Warsaw214a0b132001-08-16 20:33:48 +00003921 PyObject* x;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003922
Benjamin Peterson08bf91c2010-04-11 16:12:57 +00003923 /* Patch object types */
3924 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3925 PyType_Ready(&Scanner_Type))
Martin v. Löwis1a214512008-06-11 05:26:20 +00003926 return NULL;
Guido van Rossumb700df92000-03-31 14:59:30 +00003927
Martin v. Löwis1a214512008-06-11 05:26:20 +00003928 m = PyModule_Create(&sremodule);
Neal Norwitz1ac754f2006-01-19 06:09:39 +00003929 if (m == NULL)
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003930 return NULL;
Fredrik Lundhb35ffc02001-01-15 12:46:09 +00003931 d = PyModule_GetDict(m);
3932
Christian Heimes217cfd12007-12-02 14:31:20 +00003933 x = PyLong_FromLong(SRE_MAGIC);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003934 if (x) {
3935 PyDict_SetItemString(d, "MAGIC", x);
3936 Py_DECREF(x);
3937 }
Fredrik Lundh9c7eab82001-04-15 19:00:58 +00003938
Christian Heimes217cfd12007-12-02 14:31:20 +00003939 x = PyLong_FromLong(sizeof(SRE_CODE));
Martin v. Löwis78e2f062003-04-19 12:56:08 +00003940 if (x) {
3941 PyDict_SetItemString(d, "CODESIZE", x);
3942 Py_DECREF(x);
3943 }
3944
Neal Norwitzfe537132007-08-26 03:55:15 +00003945 x = PyUnicode_FromString(copyright);
Fredrik Lundh21009b92001-09-18 18:47:09 +00003946 if (x) {
3947 PyDict_SetItemString(d, "copyright", x);
3948 Py_DECREF(x);
3949 }
Martin v. Löwis1a214512008-06-11 05:26:20 +00003950 return m;
Guido van Rossumb700df92000-03-31 14:59:30 +00003951}
3952
Fredrik Lundh436c3d582000-06-29 08:58:44 +00003953#endif /* !defined(SRE_RECURSIVE) */
Gustavo Niemeyerbe733ee2003-04-20 07:35:44 +00003954
3955/* vim:ts=4:sw=4:et
3956*/