| /* stringlib: split implementation */ |
| |
| #ifndef STRINGLIB_SPLIT_H |
| #define STRINGLIB_SPLIT_H |
| |
| #ifndef STRINGLIB_FASTSEARCH_H |
| #error must include "stringlib/fastsearch.h" before including this module |
| #endif |
| |
| /* Overallocate the initial list to reduce the number of reallocs for small |
| split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three |
| resizes, to sizes 4, 8, then 16. Most observed string splits are for human |
| text (roughly 11 words per line) and field delimited data (usually 1-10 |
| fields). For large strings the split algorithms are bandwidth limited |
| so increasing the preallocation likely will not improve things.*/ |
| |
| #define MAX_PREALLOC 12 |
| |
| /* 5 splits gives 6 elements */ |
| #define PREALLOC_SIZE(maxsplit) \ |
| (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) |
| |
| #define SPLIT_APPEND(data, left, right) \ |
| sub = STRINGLIB_NEW((data) + (left), \ |
| (right) - (left)); \ |
| if (sub == NULL) \ |
| goto onError; \ |
| if (PyList_Append(list, sub)) { \ |
| Py_DECREF(sub); \ |
| goto onError; \ |
| } \ |
| else \ |
| Py_DECREF(sub); |
| |
| #define SPLIT_ADD(data, left, right) { \ |
| sub = STRINGLIB_NEW((data) + (left), \ |
| (right) - (left)); \ |
| if (sub == NULL) \ |
| goto onError; \ |
| if (count < MAX_PREALLOC) { \ |
| PyList_SET_ITEM(list, count, sub); \ |
| } else { \ |
| if (PyList_Append(list, sub)) { \ |
| Py_DECREF(sub); \ |
| goto onError; \ |
| } \ |
| else \ |
| Py_DECREF(sub); \ |
| } \ |
| count++; } |
| |
| |
| /* Always force the list to the expected size. */ |
| #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count |
| |
| Py_LOCAL_INLINE(PyObject *) |
| stringlib_split_whitespace(PyObject* str_obj, |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
| Py_ssize_t maxcount) |
| { |
| Py_ssize_t i, j, count=0; |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
| PyObject *sub; |
| |
| if (list == NULL) |
| return NULL; |
| |
| i = j = 0; |
| while (maxcount-- > 0) { |
| while (i < str_len && STRINGLIB_ISSPACE(str[i])) |
| i++; |
| if (i == str_len) break; |
| j = i; i++; |
| while (i < str_len && !STRINGLIB_ISSPACE(str[i])) |
| i++; |
| #ifndef STRINGLIB_MUTABLE |
| if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { |
| /* No whitespace in str_obj, so just use it as list[0] */ |
| Py_INCREF(str_obj); |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
| count++; |
| break; |
| } |
| #endif |
| SPLIT_ADD(str, j, i); |
| } |
| |
| if (i < str_len) { |
| /* Only occurs when maxcount was reached */ |
| /* Skip any remaining whitespace and copy to end of string */ |
| while (i < str_len && STRINGLIB_ISSPACE(str[i])) |
| i++; |
| if (i != str_len) |
| SPLIT_ADD(str, i, str_len); |
| } |
| FIX_PREALLOC_SIZE(list); |
| return list; |
| |
| onError: |
| Py_DECREF(list); |
| return NULL; |
| } |
| |
| Py_LOCAL_INLINE(PyObject *) |
| stringlib_split_char(PyObject* str_obj, |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
| const STRINGLIB_CHAR ch, |
| Py_ssize_t maxcount) |
| { |
| Py_ssize_t i, j, count=0; |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
| PyObject *sub; |
| |
| if (list == NULL) |
| return NULL; |
| |
| i = j = 0; |
| while ((j < str_len) && (maxcount-- > 0)) { |
| for(; j < str_len; j++) { |
| /* I found that using memchr makes no difference */ |
| if (str[j] == ch) { |
| SPLIT_ADD(str, i, j); |
| i = j = j + 1; |
| break; |
| } |
| } |
| } |
| #ifndef STRINGLIB_MUTABLE |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
| /* ch not in str_obj, so just use str_obj as list[0] */ |
| Py_INCREF(str_obj); |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
| count++; |
| } else |
| #endif |
| if (i <= str_len) { |
| SPLIT_ADD(str, i, str_len); |
| } |
| FIX_PREALLOC_SIZE(list); |
| return list; |
| |
| onError: |
| Py_DECREF(list); |
| return NULL; |
| } |
| |
| Py_LOCAL_INLINE(PyObject *) |
| stringlib_split(PyObject* str_obj, |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
| const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, |
| Py_ssize_t maxcount) |
| { |
| Py_ssize_t i, j, pos, count=0; |
| PyObject *list, *sub; |
| |
| if (sep_len == 0) { |
| PyErr_SetString(PyExc_ValueError, "empty separator"); |
| return NULL; |
| } |
| else if (sep_len == 1) |
| return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); |
| |
| list = PyList_New(PREALLOC_SIZE(maxcount)); |
| if (list == NULL) |
| return NULL; |
| |
| i = j = 0; |
| while (maxcount-- > 0) { |
| pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); |
| if (pos < 0) |
| break; |
| j = i + pos; |
| SPLIT_ADD(str, i, j); |
| i = j + sep_len; |
| } |
| #ifndef STRINGLIB_MUTABLE |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
| /* No match in str_obj, so just use it as list[0] */ |
| Py_INCREF(str_obj); |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
| count++; |
| } else |
| #endif |
| { |
| SPLIT_ADD(str, i, str_len); |
| } |
| FIX_PREALLOC_SIZE(list); |
| return list; |
| |
| onError: |
| Py_DECREF(list); |
| return NULL; |
| } |
| |
| Py_LOCAL_INLINE(PyObject *) |
| stringlib_rsplit_whitespace(PyObject* str_obj, |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
| Py_ssize_t maxcount) |
| { |
| Py_ssize_t i, j, count=0; |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
| PyObject *sub; |
| |
| if (list == NULL) |
| return NULL; |
| |
| i = j = str_len - 1; |
| while (maxcount-- > 0) { |
| while (i >= 0 && STRINGLIB_ISSPACE(str[i])) |
| i--; |
| if (i < 0) break; |
| j = i; i--; |
| while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) |
| i--; |
| #ifndef STRINGLIB_MUTABLE |
| if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
| /* No whitespace in str_obj, so just use it as list[0] */ |
| Py_INCREF(str_obj); |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
| count++; |
| break; |
| } |
| #endif |
| SPLIT_ADD(str, i + 1, j + 1); |
| } |
| |
| if (i >= 0) { |
| /* Only occurs when maxcount was reached */ |
| /* Skip any remaining whitespace and copy to beginning of string */ |
| while (i >= 0 && STRINGLIB_ISSPACE(str[i])) |
| i--; |
| if (i >= 0) |
| SPLIT_ADD(str, 0, i + 1); |
| } |
| FIX_PREALLOC_SIZE(list); |
| if (PyList_Reverse(list) < 0) |
| goto onError; |
| return list; |
| |
| onError: |
| Py_DECREF(list); |
| return NULL; |
| } |
| |
| Py_LOCAL_INLINE(PyObject *) |
| stringlib_rsplit_char(PyObject* str_obj, |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
| const STRINGLIB_CHAR ch, |
| Py_ssize_t maxcount) |
| { |
| Py_ssize_t i, j, count=0; |
| PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); |
| PyObject *sub; |
| |
| if (list == NULL) |
| return NULL; |
| |
| i = j = str_len - 1; |
| while ((i >= 0) && (maxcount-- > 0)) { |
| for(; i >= 0; i--) { |
| if (str[i] == ch) { |
| SPLIT_ADD(str, i + 1, j + 1); |
| j = i = i - 1; |
| break; |
| } |
| } |
| } |
| #ifndef STRINGLIB_MUTABLE |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
| /* ch not in str_obj, so just use str_obj as list[0] */ |
| Py_INCREF(str_obj); |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
| count++; |
| } else |
| #endif |
| if (j >= -1) { |
| SPLIT_ADD(str, 0, j + 1); |
| } |
| FIX_PREALLOC_SIZE(list); |
| if (PyList_Reverse(list) < 0) |
| goto onError; |
| return list; |
| |
| onError: |
| Py_DECREF(list); |
| return NULL; |
| } |
| |
| Py_LOCAL_INLINE(PyObject *) |
| stringlib_rsplit(PyObject* str_obj, |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
| const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, |
| Py_ssize_t maxcount) |
| { |
| Py_ssize_t j, pos, count=0; |
| PyObject *list, *sub; |
| |
| if (sep_len == 0) { |
| PyErr_SetString(PyExc_ValueError, "empty separator"); |
| return NULL; |
| } |
| else if (sep_len == 1) |
| return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); |
| |
| list = PyList_New(PREALLOC_SIZE(maxcount)); |
| if (list == NULL) |
| return NULL; |
| |
| j = str_len; |
| while (maxcount-- > 0) { |
| pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); |
| if (pos < 0) |
| break; |
| SPLIT_ADD(str, pos + sep_len, j); |
| j = pos; |
| } |
| #ifndef STRINGLIB_MUTABLE |
| if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { |
| /* No match in str_obj, so just use it as list[0] */ |
| Py_INCREF(str_obj); |
| PyList_SET_ITEM(list, 0, (PyObject *)str_obj); |
| count++; |
| } else |
| #endif |
| { |
| SPLIT_ADD(str, 0, j); |
| } |
| FIX_PREALLOC_SIZE(list); |
| if (PyList_Reverse(list) < 0) |
| goto onError; |
| return list; |
| |
| onError: |
| Py_DECREF(list); |
| return NULL; |
| } |
| |
| Py_LOCAL_INLINE(PyObject *) |
| stringlib_splitlines(PyObject* str_obj, |
| const STRINGLIB_CHAR* str, Py_ssize_t str_len, |
| int keepends) |
| { |
| /* This does not use the preallocated list because splitlines is |
| usually run with hundreds of newlines. The overhead of |
| switching between PyList_SET_ITEM and append causes about a |
| 2-3% slowdown for that common case. A smarter implementation |
| could move the if check out, so the SET_ITEMs are done first |
| and the appends only done when the prealloc buffer is full. |
| That's too much work for little gain.*/ |
| |
| register Py_ssize_t i; |
| register Py_ssize_t j; |
| PyObject *list = PyList_New(0); |
| PyObject *sub; |
| |
| if (list == NULL) |
| return NULL; |
| |
| for (i = j = 0; i < str_len; ) { |
| Py_ssize_t eol; |
| |
| /* Find a line and append it */ |
| while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) |
| i++; |
| |
| /* Skip the line break reading CRLF as one line break */ |
| eol = i; |
| if (i < str_len) { |
| if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') |
| i += 2; |
| else |
| i++; |
| if (keepends) |
| eol = i; |
| } |
| #ifndef STRINGLIB_MUTABLE |
| if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { |
| /* No linebreak in str_obj, so just use it as list[0] */ |
| if (PyList_Append(list, str_obj)) |
| goto onError; |
| break; |
| } |
| #endif |
| SPLIT_APPEND(str, j, eol); |
| j = i; |
| } |
| return list; |
| |
| onError: |
| Py_DECREF(list); |
| return NULL; |
| } |
| |
| #endif |