bpo-25054, bpo-1647489: Added support for splitting on zero-width patterns. (#4471)
Also fixed searching for patterns that could match an empty string.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index a9b6b50..68fc523 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -446,6 +446,8 @@
state->isbytes = isbytes;
state->charsize = charsize;
+ state->match_all = 0;
+ state->must_advance = 0;
state->beginning = ptr;
@@ -559,14 +561,14 @@
}
LOCAL(Py_ssize_t)
-sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+sre_match(SRE_STATE* state, SRE_CODE* pattern)
{
if (state->charsize == 1)
- return sre_ucs1_match(state, pattern, match_all);
+ return sre_ucs1_match(state, pattern, 1);
if (state->charsize == 2)
- return sre_ucs2_match(state, pattern, match_all);
+ return sre_ucs2_match(state, pattern, 1);
assert(state->charsize == 4);
- return sre_ucs4_match(state, pattern, match_all);
+ return sre_ucs4_match(state, pattern, 1);
}
LOCAL(Py_ssize_t)
@@ -606,7 +608,7 @@
TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
- status = sre_match(&state, PatternObject_GetCode(self), 0);
+ status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@@ -645,7 +647,8 @@
TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
- status = sre_match(&state, PatternObject_GetCode(self), 1);
+ state.match_all = 1;
+ status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@@ -808,11 +811,8 @@
if (status < 0)
goto error;
- if (state.ptr == state.start)
- state.start = (void*) ((char*) state.ptr + state.charsize);
- else
- state.start = state.ptr;
-
+ state.must_advance = (state.ptr == state.start);
+ state.start = state.ptr;
}
state_fini(&state);
@@ -901,17 +901,6 @@
void* last;
assert(self->codesize != 0);
- if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
- if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
- PyErr_SetString(PyExc_ValueError,
- "split() requires a non-empty pattern match.");
- return NULL;
- }
- if (PyErr_WarnEx(PyExc_FutureWarning,
- "split() requires a non-empty pattern match.",
- 1) < 0)
- return NULL;
- }
if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
return NULL;
@@ -942,14 +931,6 @@
goto error;
}
- if (state.start == state.ptr) {
- if (last == state.end || state.ptr == state.end)
- break;
- /* skip one character */
- state.start = (void*) ((char*) state.ptr + state.charsize);
- continue;
- }
-
/* get segment before this match */
item = getslice(state.isbytes, state.beginning,
string, STATE_OFFSET(&state, last),
@@ -974,7 +955,7 @@
}
n = n + 1;
-
+ state.must_advance = 1;
last = state.start = state.ptr;
}
@@ -1101,9 +1082,7 @@
if (status < 0)
goto error;
- } else if (i == b && i == e && n > 0)
- /* ignore empty match on latest position */
- goto next;
+ }
if (filter_is_callable) {
/* pass match object through filter */
@@ -1130,16 +1109,8 @@
i = e;
n = n + 1;
-
-next:
- /* move on */
- if (state.ptr == state.end)
- break;
- if (state.ptr == state.start)
- state.start = (void*) ((char*) state.ptr + state.charsize);
- else
- state.start = state.ptr;
-
+ state.must_advance = 1;
+ state.start = state.ptr;
}
/* get segment following last match */
@@ -2450,7 +2421,7 @@
state->ptr = state->start;
- status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
+ status = sre_match(state, PatternObject_GetCode(self->pattern));
if (PyErr_Occurred())
return NULL;
@@ -2459,12 +2430,10 @@
if (status == 0)
state->start = NULL;
- else if (state->ptr != state->start)
+ else {
+ state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
- else if (state->ptr != state->end)
- state->start = (void*) ((char*) state->ptr + state->charsize);
- else
- state->start = NULL;
+ }
return match;
}
@@ -2499,12 +2468,10 @@
if (status == 0)
state->start = NULL;
- else if (state->ptr != state->start)
+ else {
+ state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
- else if (state->ptr != state->end)
- state->start = (void*) ((char*) state->ptr + state->charsize);
- else
- state->start = NULL;
+ }
return match;
}