bpo-25054, bpo-1647489: Added support for splitting on zero-width patterns. (#4471)

Also fixed searching for patterns that can match an empty string.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index a9b6b50..68fc523 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -446,6 +446,8 @@
 
     state->isbytes = isbytes;
     state->charsize = charsize;
+    state->match_all = 0;
+    state->must_advance = 0;
 
     state->beginning = ptr;
 
@@ -559,14 +561,14 @@
 }
 
 LOCAL(Py_ssize_t)
-sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+sre_match(SRE_STATE* state, SRE_CODE* pattern)
 {
     if (state->charsize == 1)
-        return sre_ucs1_match(state, pattern, match_all);
+        return sre_ucs1_match(state, pattern, 1);
     if (state->charsize == 2)
-        return sre_ucs2_match(state, pattern, match_all);
+        return sre_ucs2_match(state, pattern, 1);
     assert(state->charsize == 4);
-    return sre_ucs4_match(state, pattern, match_all);
+    return sre_ucs4_match(state, pattern, 1);
 }
 
 LOCAL(Py_ssize_t)
@@ -606,7 +608,7 @@
 
     TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
 
-    status = sre_match(&state, PatternObject_GetCode(self), 0);
+    status = sre_match(&state, PatternObject_GetCode(self));
 
     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
     if (PyErr_Occurred()) {
@@ -645,7 +647,8 @@
 
     TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
 
-    status = sre_match(&state, PatternObject_GetCode(self), 1);
+    state.match_all = 1;
+    status = sre_match(&state, PatternObject_GetCode(self));
 
     TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
     if (PyErr_Occurred()) {
@@ -808,11 +811,8 @@
         if (status < 0)
             goto error;
 
-        if (state.ptr == state.start)
-            state.start = (void*) ((char*) state.ptr + state.charsize);
-        else
-            state.start = state.ptr;
-
+        state.must_advance = (state.ptr == state.start);
+        state.start = state.ptr;
     }
 
     state_fini(&state);
@@ -901,17 +901,6 @@
     void* last;
 
     assert(self->codesize != 0);
-    if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
-        if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
-            PyErr_SetString(PyExc_ValueError,
-                            "split() requires a non-empty pattern match.");
-            return NULL;
-        }
-        if (PyErr_WarnEx(PyExc_FutureWarning,
-                         "split() requires a non-empty pattern match.",
-                         1) < 0)
-            return NULL;
-    }
 
     if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
         return NULL;
@@ -942,14 +931,6 @@
             goto error;
         }
 
-        if (state.start == state.ptr) {
-            if (last == state.end || state.ptr == state.end)
-                break;
-            /* skip one character */
-            state.start = (void*) ((char*) state.ptr + state.charsize);
-            continue;
-        }
-
         /* get segment before this match */
         item = getslice(state.isbytes, state.beginning,
             string, STATE_OFFSET(&state, last),
@@ -974,7 +955,7 @@
         }
 
         n = n + 1;
-
+        state.must_advance = 1;
         last = state.start = state.ptr;
 
     }
@@ -1101,9 +1082,7 @@
             if (status < 0)
                 goto error;
 
-        } else if (i == b && i == e && n > 0)
-            /* ignore empty match on latest position */
-            goto next;
+        }
 
         if (filter_is_callable) {
             /* pass match object through filter */
@@ -1130,16 +1109,8 @@
 
         i = e;
         n = n + 1;
-
-next:
-        /* move on */
-        if (state.ptr == state.end)
-            break;
-        if (state.ptr == state.start)
-            state.start = (void*) ((char*) state.ptr + state.charsize);
-        else
-            state.start = state.ptr;
-
+        state.must_advance = 1;
+        state.start = state.ptr;
     }
 
     /* get segment following last match */
@@ -2450,7 +2421,7 @@
 
     state->ptr = state->start;
 
-    status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
+    status = sre_match(state, PatternObject_GetCode(self->pattern));
     if (PyErr_Occurred())
         return NULL;
 
@@ -2459,12 +2430,10 @@
 
     if (status == 0)
         state->start = NULL;
-    else if (state->ptr != state->start)
+    else {
+        state->must_advance = (state->ptr == state->start);
         state->start = state->ptr;
-    else if (state->ptr != state->end)
-        state->start = (void*) ((char*) state->ptr + state->charsize);
-    else
-        state->start = NULL;
+    }
 
     return match;
 }
@@ -2499,12 +2468,10 @@
 
     if (status == 0)
         state->start = NULL;
-    else if (state->ptr != state->start)
+    else {
+        state->must_advance = (state->ptr == state->start);
         state->start = state->ptr;
-    else if (state->ptr != state->end)
-        state->start = (void*) ((char*) state->ptr + state->charsize);
-    else
-        state->start = NULL;
+    }
 
     return match;
 }
diff --git a/Modules/sre.h b/Modules/sre.h
index 585d284..a728488 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -67,6 +67,7 @@
     void* end; /* end of original string */
     /* attributes for the match object */
     PyObject* string;
+    Py_buffer buffer;
     Py_ssize_t pos, endpos;
     int isbytes;
     int charsize; /* character size */
@@ -74,11 +75,12 @@
     Py_ssize_t lastindex;
     Py_ssize_t lastmark;
     void** mark;
+    int match_all;
+    int must_advance;
     /* dynamically allocated stuff */
     char* data_stack;
     size_t data_stack_size;
     size_t data_stack_base;
-    Py_buffer buffer;
     /* current repeat context */
     SRE_REPEAT *repeat;
 } SRE_STATE;
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index e13b90e..44948e2 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -199,7 +199,7 @@
     return up != lo && SRE(charset)(state, set, up);
 }
 
-LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
+LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel);
 
 LOCAL(Py_ssize_t)
 SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
@@ -510,12 +510,12 @@
 #define JUMP_ASSERT          12
 #define JUMP_ASSERT_NOT      13
 
-#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \
+#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
     DATA_ALLOC(SRE(match_context), nextctx); \
     nextctx->last_ctx_pos = ctx_pos; \
     nextctx->jump = jumpvalue; \
     nextctx->pattern = nextpattern; \
-    nextctx->match_all = matchall; \
+    nextctx->toplevel = toplevel_; \
     ctx_pos = alloc_pos; \
     ctx = nextctx; \
     goto entrance; \
@@ -523,7 +523,7 @@
     while (0) /* gcc doesn't like labels at end of scopes */ \
 
 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
-    DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all)
+    DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)
 
 #define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
     DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
@@ -540,13 +540,13 @@
         SRE_CODE chr;
         SRE_REPEAT* rep;
     } u;
-    int match_all;
+    int toplevel;
 } SRE(match_context);
 
 /* check if string matches the given pattern.  returns <0 for
    error, 0 for failure, and 1 for success */
 LOCAL(Py_ssize_t)
-SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
 {
     SRE_CHAR* end = (SRE_CHAR *)state->end;
     Py_ssize_t alloc_pos, ctx_pos = -1;
@@ -563,7 +563,7 @@
     ctx->last_ctx_pos = -1;
     ctx->jump = JUMP_NONE;
     ctx->pattern = pattern;
-    ctx->match_all = match_all;
+    ctx->toplevel = toplevel;
     ctx_pos = alloc_pos;
 
 entrance:
@@ -636,11 +636,14 @@
         case SRE_OP_SUCCESS:
             /* end of pattern */
             TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
-            if (!ctx->match_all || ctx->ptr == state->end) {
-                state->ptr = ctx->ptr;
-                RETURN_SUCCESS;
+            if (ctx->toplevel &&
+                ((state->match_all && ctx->ptr != state->end) ||
+                 (state->must_advance && ctx->ptr == state->start)))
+            {
+                RETURN_FAILURE;
             }
-            RETURN_FAILURE;
+            state->ptr = ctx->ptr;
+            RETURN_SUCCESS;
 
         case SRE_OP_AT:
             /* match at given position */
@@ -856,7 +859,9 @@
                 RETURN_FAILURE;
 
             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
-                ctx->ptr == state->end) {
+                ctx->ptr == state->end &&
+                !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
+            {
                 /* tail is empty.  we're finished */
                 state->ptr = ctx->ptr;
                 RETURN_SUCCESS;
@@ -941,7 +946,10 @@
             }
 
             if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
-                (!match_all || ctx->ptr == state->end)) {
+                !(ctx->toplevel &&
+                  ((state->match_all && ctx->ptr != state->end) ||
+                   (state->must_advance && ctx->ptr == state->start))))
+            {
                 /* tail is empty.  we're finished */
                 state->ptr = ctx->ptr;
                 RETURN_SUCCESS;
@@ -1417,6 +1425,7 @@
             return 0; /* literal can't match: doesn't fit in char width */
 #endif
         end = (SRE_CHAR *)state->end;
+        state->must_advance = 0;
         while (ptr < end) {
             while (*ptr != c) {
                 if (++ptr >= end)
@@ -1458,6 +1467,7 @@
                 return 0;
 
             i = 1;
+            state->must_advance = 0;
             do {
                 if (*ptr == (SRE_CHAR) prefix[i]) {
                     if (++i != prefix_len) {
@@ -1487,6 +1497,7 @@
     if (charset) {
         /* pattern starts with a character from a known set */
         end = (SRE_CHAR *)state->end;
+        state->must_advance = 0;
         for (;;) {
             while (ptr < end && !SRE(charset)(state, charset, *ptr))
                 ptr++;
@@ -1503,13 +1514,15 @@
     } else {
         /* general case */
         assert(ptr <= end);
-        while (1) {
+        TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
+        state->start = state->ptr = ptr;
+        status = SRE(match)(state, pattern, 1);
+        state->must_advance = 0;
+        while (status == 0 && ptr < end) {
+            ptr++;
             TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
             state->start = state->ptr = ptr;
             status = SRE(match)(state, pattern, 0);
-            if (status != 0 || ptr >= end)
-                break;
-            ptr++;
         }
     }