Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
methods of bytes, bytearray and unicode objects by using a common
implementation based on stringlib's fast search. Patch by Florent Xicluna.
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index 0f3874e..43ef3fa 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -841,6 +841,7 @@
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
+#include "stringlib/split.h"
#define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping
#include "stringlib/localeutil.h"
@@ -1425,145 +1426,6 @@
#define STRIPNAME(i) (stripformat[i]+3)
-
-/* Don't call if length < 2 */
-#define Py_STRING_MATCH(target, offset, pattern, length) \
- (target[offset] == pattern[0] && \
- target[offset+length-1] == pattern[length-1] && \
- !memcmp(target+offset+1, pattern+1, length-2) )
-
-
-/* Overallocate the initial list to reduce the number of reallocs for small
- split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
- resizes, to sizes 4, 8, then 16. Most observed string splits are for human
- text (roughly 11 words per line) and field delimited data (usually 1-10
- fields). For large strings the split algorithms are bandwidth limited
- so increasing the preallocation likely will not improve things.*/
-
-#define MAX_PREALLOC 12
-
-/* 5 splits gives 6 elements */
-#define PREALLOC_SIZE(maxsplit) \
- (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
-
-#define SPLIT_APPEND(data, left, right) \
- str = PyString_FromStringAndSize((data) + (left), \
- (right) - (left)); \
- if (str == NULL) \
- goto onError; \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
-#define SPLIT_ADD(data, left, right) { \
- str = PyString_FromStringAndSize((data) + (left), \
- (right) - (left)); \
- if (str == NULL) \
- goto onError; \
- if (count < MAX_PREALLOC) { \
- PyList_SET_ITEM(list, count, str); \
- } else { \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str); \
- } \
- count++; }
-
-/* Always force the list to the expected size. */
-#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
-
-#define SKIP_SPACE(s, i, len) { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; }
-#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; }
-#define RSKIP_SPACE(s, i) { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; }
-#define RSKIP_NONSPACE(s, i) { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; }
-
-Py_LOCAL_INLINE(PyObject *)
-split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
-{
- const char *s = PyString_AS_STRING(self);
- Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
-
- if (list == NULL)
- return NULL;
-
- i = j = 0;
-
- while (maxsplit-- > 0) {
- SKIP_SPACE(s, i, len);
- if (i==len) break;
- j = i; i++;
- SKIP_NONSPACE(s, i, len);
- if (j == 0 && i == len && PyString_CheckExact(self)) {
- /* No whitespace in self, so just use it as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- break;
- }
- SPLIT_ADD(s, j, i);
- }
-
- if (i < len) {
- /* Only occurs when maxsplit was reached */
- /* Skip any remaining whitespace and copy to end of string */
- SKIP_SPACE(s, i, len);
- if (i != len)
- SPLIT_ADD(s, i, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-Py_LOCAL_INLINE(PyObject *)
-split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
-{
- const char *s = PyString_AS_STRING(self);
- register Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- i = j = 0;
- while ((j < len) && (maxcount-- > 0)) {
- for(; j<len; j++) {
- /* I found that using memchr makes no difference */
- if (s[j] == ch) {
- SPLIT_ADD(s, i, j);
- i = j = j + 1;
- break;
- }
- }
- }
- if (i == 0 && count == 0 && PyString_CheckExact(self)) {
- /* ch not in self, so just use self as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- }
- else if (i <= len) {
- SPLIT_ADD(s, i, len);
- }
- FIX_PREALLOC_SIZE(list);
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
PyDoc_STRVAR(split__doc__,
"S.split([sep [,maxsplit]]) -> list of strings\n\
\n\
@@ -1576,17 +1438,17 @@
static PyObject *
string_split(PyStringObject *self, PyObject *args)
{
- Py_ssize_t len = PyString_GET_SIZE(self), n, i, j, pos;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t len = PyString_GET_SIZE(self), n;
+ Py_ssize_t maxsplit = -1;
const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj = Py_None;
+ PyObject *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
- return split_whitespace(self, len, maxsplit);
+ return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
@@ -1598,33 +1460,7 @@
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else if (n == 1)
- return split_char(self, len, sub[0], maxsplit);
-
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
-
- i = j = 0;
- while (maxsplit-- > 0) {
- pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
- if (pos < 0)
- break;
- j = i + pos;
- SPLIT_ADD(s, i, j);
- i = j + n;
- }
- SPLIT_ADD(s, i, len);
- FIX_PREALLOC_SIZE(list);
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
+ return stringlib_split((PyObject*) self, s, len, sub, n, maxsplit);
}
PyDoc_STRVAR(partition__doc__,
@@ -1689,90 +1525,6 @@
);
}
-Py_LOCAL_INLINE(PyObject *)
-rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
-{
- const char *s = PyString_AS_STRING(self);
- Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
-
- if (list == NULL)
- return NULL;
-
- i = j = len-1;
-
- while (maxsplit-- > 0) {
- RSKIP_SPACE(s, i);
- if (i<0) break;
- j = i; i--;
- RSKIP_NONSPACE(s, i);
- if (j == len-1 && i < 0 && PyString_CheckExact(self)) {
- /* No whitespace in self, so just use it as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- break;
- }
- SPLIT_ADD(s, i + 1, j + 1);
- }
- if (i >= 0) {
- /* Only occurs when maxsplit was reached */
- /* Skip any remaining whitespace and copy to beginning of string */
- RSKIP_SPACE(s, i);
- if (i >= 0)
- SPLIT_ADD(s, 0, i + 1);
-
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-Py_LOCAL_INLINE(PyObject *)
-rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
-{
- const char *s = PyString_AS_STRING(self);
- register Py_ssize_t i, j, count=0;
- PyObject *str;
- PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
-
- if (list == NULL)
- return NULL;
-
- i = j = len - 1;
- while ((i >= 0) && (maxcount-- > 0)) {
- for (; i >= 0; i--) {
- if (s[i] == ch) {
- SPLIT_ADD(s, i + 1, j + 1);
- j = i = i - 1;
- break;
- }
- }
- }
- if (i < 0 && count == 0 && PyString_CheckExact(self)) {
- /* ch not in self, so just use self as list[0] */
- Py_INCREF(self);
- PyList_SET_ITEM(list, 0, (PyObject *)self);
- count++;
- }
- else if (j >= -1) {
- SPLIT_ADD(s, 0, j + 1);
- }
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
PyDoc_STRVAR(rsplit__doc__,
"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
\n\
@@ -1785,17 +1537,17 @@
static PyObject *
string_rsplit(PyStringObject *self, PyObject *args)
{
- Py_ssize_t len = PyString_GET_SIZE(self), n, j, pos;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t len = PyString_GET_SIZE(self), n;
+ Py_ssize_t maxsplit = -1;
const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj = Py_None;
+ PyObject *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None)
- return rsplit_whitespace(self, len, maxsplit);
+ return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
@@ -1807,35 +1559,7 @@
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL;
- if (n == 0) {
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else if (n == 1)
- return rsplit_char(self, len, sub[0], maxsplit);
-
- list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
- return NULL;
-
- j = len;
-
- while (maxsplit-- > 0) {
- pos = fastsearch(s, j, sub, n, FAST_RSEARCH);
- if (pos < 0)
- break;
- SPLIT_ADD(s, pos + n, j);
- j = pos;
- }
- SPLIT_ADD(s, 0, j);
- FIX_PREALLOC_SIZE(list);
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
-onError:
- Py_DECREF(list);
- return NULL;
+ return stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit);
}
@@ -1950,20 +1674,20 @@
return string_join((PyStringObject *)sep, x);
}
-Py_LOCAL_INLINE(void)
-string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
-{
- if (*end > len)
- *end = len;
- else if (*end < 0)
- *end += len;
- if (*end < 0)
- *end = 0;
- if (*start < 0)
- *start += len;
- if (*start < 0)
- *start = 0;
-}
+/* helper macro to fixup start/end slice values */
+#define ADJUST_INDICES(start, end, len) \
+ if (end > len) \
+ end = len; \
+ else if (end < 0) { \
+ end += len; \
+ if (end < 0) \
+ end = 0; \
+ } \
+ if (start < 0) { \
+ start += len; \
+ if (start < 0) \
+ start = 0; \
+ }
Py_LOCAL_INLINE(Py_ssize_t)
string_find_internal(PyStringObject *self, PyObject *args, int dir)
@@ -2417,10 +2141,10 @@
else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
return NULL;
- string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
+ ADJUST_INDICES(start, end, PyString_GET_SIZE(self));
return PyInt_FromSsize_t(
- stringlib_count(str + start, end - start, sub, sub_len)
+ stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
);
}
@@ -2583,9 +2307,6 @@
}
-#define FORWARD 1
-#define REVERSE -1
-
/* find and count characters and substrings */
#define findchar(target, target_len, c) \
@@ -2621,93 +2342,6 @@
return count;
}
-Py_LOCAL(Py_ssize_t)
-findstring(const char *target, Py_ssize_t target_len,
- const char *pattern, Py_ssize_t pattern_len,
- Py_ssize_t start,
- Py_ssize_t end,
- int direction)
-{
- if (start < 0) {
- start += target_len;
- if (start < 0)
- start = 0;
- }
- if (end > target_len) {
- end = target_len;
- } else if (end < 0) {
- end += target_len;
- if (end < 0)
- end = 0;
- }
-
- /* zero-length substrings always match at the first attempt */
- if (pattern_len == 0)
- return (direction > 0) ? start : end;
-
- end -= pattern_len;
-
- if (direction < 0) {
- for (; end >= start; end--)
- if (Py_STRING_MATCH(target, end, pattern, pattern_len))
- return end;
- } else {
- for (; start <= end; start++)
- if (Py_STRING_MATCH(target, start, pattern, pattern_len))
- return start;
- }
- return -1;
-}
-
-Py_LOCAL_INLINE(Py_ssize_t)
-countstring(const char *target, Py_ssize_t target_len,
- const char *pattern, Py_ssize_t pattern_len,
- Py_ssize_t start,
- Py_ssize_t end,
- int direction, Py_ssize_t maxcount)
-{
- Py_ssize_t count=0;
-
- if (start < 0) {
- start += target_len;
- if (start < 0)
- start = 0;
- }
- if (end > target_len) {
- end = target_len;
- } else if (end < 0) {
- end += target_len;
- if (end < 0)
- end = 0;
- }
-
- /* zero-length substrings match everywhere */
- if (pattern_len == 0 || maxcount == 0) {
- if (target_len+1 < maxcount)
- return target_len+1;
- return maxcount;
- }
-
- end -= pattern_len;
- if (direction < 0) {
- for (; (end >= start); end--)
- if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
- count++;
- if (--maxcount <= 0) break;
- end -= pattern_len-1;
- }
- } else {
- for (; (start <= end); start++)
- if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
- count++;
- if (--maxcount <= 0)
- break;
- start += pattern_len-1;
- }
- }
- return count;
-}
-
/* Algorithms for different cases of string replacement */
@@ -2828,10 +2462,9 @@
self_len = PyString_GET_SIZE(self);
self_s = PyString_AS_STRING(self);
- count = countstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, 1,
- maxcount);
+ count = stringlib_count(self_s, self_len,
+ from_s, from_len,
+ maxcount);
if (count == 0) {
/* no matches */
@@ -2850,9 +2483,9 @@
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset == -1)
break;
next = start + offset;
@@ -2928,9 +2561,9 @@
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
- offset = findstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, FORWARD);
+ offset = stringlib_find(self_s, self_len,
+ from_s, from_len,
+ 0);
if (offset == -1) {
/* No matches; return the original string */
return return_self(self);
@@ -2950,9 +2583,9 @@
end = result_s + self_len;
while ( --maxcount > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset==-1)
break;
Py_MEMCPY(start+offset, to_s, from_len);
@@ -3044,9 +2677,10 @@
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
- count = countstring(self_s, self_len,
- from_s, from_len,
- 0, self_len, FORWARD, maxcount);
+ count = stringlib_count(self_s, self_len,
+ from_s, from_len,
+ maxcount);
+
if (count == 0) {
/* no matches, return unchanged */
return return_self(self);
@@ -3073,9 +2707,9 @@
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
- offset = findstring(start, end-start,
- from_s, from_len,
- 0, end-start, FORWARD);
+ offset = stringlib_find(start, end-start,
+ from_s, from_len,
+ 0);
if (offset == -1)
break;
next = start+offset;
@@ -3245,7 +2879,7 @@
return -1;
str = PyString_AS_STRING(self);
- string_adjust_indices(&start, &end, len);
+ ADJUST_INDICES(start, end, len);
if (direction < 0) {
/* startswith */
@@ -3913,62 +3547,15 @@
static PyObject*
string_splitlines(PyStringObject *self, PyObject *args)
{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len;
int keepends = 0;
- PyObject *list;
- PyObject *str;
- char *data;
if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
return NULL;
- data = PyString_AS_STRING(self);
- len = PyString_GET_SIZE(self);
-
- /* This does not use the preallocated list because splitlines is
- usually run with hundreds of newlines. The overhead of
- switching between PyList_SET_ITEM and append causes about a
- 2-3% slowdown for that common case. A smarter implementation
- could move the if check out, so the SET_ITEMs are done first
- and the appends only done when the prealloc buffer is full.
- That's too much work for little gain.*/
-
- list = PyList_New(0);
- if (!list)
- goto onError;
-
- for (i = j = 0; i < len; ) {
- Py_ssize_t eol;
-
- /* Find a line and append it */
- while (i < len && data[i] != '\n' && data[i] != '\r')
- i++;
-
- /* Skip the line break reading CRLF as one line break */
- eol = i;
- if (i < len) {
- if (data[i] == '\r' && i + 1 < len &&
- data[i+1] == '\n')
- i += 2;
- else
- i++;
- if (keepends)
- eol = i;
- }
- SPLIT_APPEND(data, j, eol);
- j = i;
- }
- if (j < len) {
- SPLIT_APPEND(data, j, len);
- }
-
- return list;
-
- onError:
- Py_XDECREF(list);
- return NULL;
+ return stringlib_splitlines(
+ (PyObject*) self, PyString_AS_STRING(self), PyString_GET_SIZE(self),
+ keepends
+ );
}
PyDoc_STRVAR(sizeof__doc__,
@@ -3982,11 +3569,6 @@
return PyInt_FromSsize_t(res);
}
-#undef SPLIT_APPEND
-#undef SPLIT_ADD
-#undef MAX_PREALLOC
-#undef PREALLOC_SIZE
-
static PyObject *
string_getnewargs(PyStringObject *v)
{