Issue #11114: Fix catastrophic performance of tell() on text files (up
to 1000x faster in some cases). It is still one to two order of magnitudes
slower than binary tell().
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index a5d6135..ad5bfcc 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -1488,6 +1488,7 @@
self._decoded_chars_used = 0 # offset into _decoded_chars for read()
self._snapshot = None # info for reconstructing decoder state
self._seekable = self._telling = self.buffer.seekable()
+ self._b2cratio = 0.0
if self._seekable and self.writable():
position = self.buffer.tell()
@@ -1655,7 +1656,12 @@
# Read a chunk, decode it, and put the result in self._decoded_chars.
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
eof = not input_chunk
- self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
+ decoded_chars = self._decoder.decode(input_chunk, eof)
+ self._set_decoded_chars(decoded_chars)
+ if decoded_chars:
+ self._b2cratio = len(input_chunk) / len(self._decoded_chars)
+ else:
+ self._b2cratio = 0.0
if self._telling:
# At the snapshot point, len(dec_buffer) bytes before the read,
@@ -1709,20 +1715,56 @@
# forward until it gives us enough decoded characters.
saved_state = decoder.getstate()
try:
+ # Fast search for an acceptable start point, close to our
+ # current pos.
+ # Rationale: calling decoder.decode() has a large overhead
+ # regardless of chunk size; we want the number of such calls to
+ # be O(1) in most situations (common decoders, non-crazy input).
+ # Actually, it will be exactly 1 for fixed-size codecs (all
+ # 8-bit codecs, also UTF-16 and UTF-32).
+ skip_bytes = int(self._b2cratio * chars_to_skip)
+ skip_back = 1
+ assert skip_bytes <= len(next_input)
+ while skip_bytes > 0:
+ decoder.setstate((b'', dec_flags))
+ # Decode up to temptative start point
+ n = len(decoder.decode(next_input[:skip_bytes]))
+ if n <= chars_to_skip:
+ b, d = decoder.getstate()
+ if not b:
+ # Before pos and no bytes buffered in decoder => OK
+ dec_flags = d
+ chars_to_skip -= n
+ break
+ # Skip back by buffered amount and reset heuristic
+ skip_bytes -= len(b)
+ skip_back = 1
+ else:
+ # We're too far ahead, skip back a bit
+ skip_bytes -= skip_back
+ skip_back = skip_back * 2
+ else:
+ skip_bytes = 0
+ decoder.setstate((b'', dec_flags))
+
# Note our initial start point.
- decoder.setstate((b'', dec_flags))
- start_pos = position
- start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0
- need_eof = 0
+ start_pos = position + skip_bytes
+ start_flags = dec_flags
+ if chars_to_skip == 0:
+ # We haven't moved from the start point.
+ return self._pack_cookie(start_pos, start_flags)
# Feed the decoder one byte at a time. As we go, note the
# nearest "safe start point" before the current location
# (a point where the decoder has nothing buffered, so seek()
# can safely start from there and advance to this location).
- next_byte = bytearray(1)
- for next_byte[0] in next_input:
+ bytes_fed = 0
+ need_eof = 0
+ # Chars decoded since `start_pos`
+ chars_decoded = 0
+ for i in range(skip_bytes, len(next_input)):
bytes_fed += 1
- chars_decoded += len(decoder.decode(next_byte))
+ chars_decoded += len(decoder.decode(next_input[i:i+1]))
dec_buffer, dec_flags = decoder.getstate()
if not dec_buffer and chars_decoded <= chars_to_skip:
# Decoder buffer is empty, so this is a safe start point.
diff --git a/Misc/NEWS b/Misc/NEWS
index be3fe10..b0f49c1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -35,6 +35,10 @@
Library
-------
+- Issue #11114: Fix catastrophic performance of tell() on text files (up
+ to 1000x faster in some cases). It is still one to two order of magnitudes
+ slower than binary tell().
+
- Issue 10882: Add os.sendfile function.
- Issue #10868: Allow usage of the register method of an ABC as a class
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index 73d83a1..35bd922 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -678,12 +678,16 @@
PyObject *pending_bytes; /* list of bytes objects waiting to be
written, or NULL */
Py_ssize_t pending_bytes_count;
- PyObject *snapshot;
+
/* snapshot is either None, or a tuple (dec_flags, next_input) where
* dec_flags is the second (integer) item of the decoder state and
* next_input is the chunk of input bytes that comes next after the
* snapshot point. We use this to reconstruct decoder states in tell().
*/
+ PyObject *snapshot;
+ /* Bytes-to-characters ratio for the current chunk. Serves as input for
+ the heuristic in tell(). */
+ double b2cratio;
/* Cache raw object if it's a FileIO object */
PyObject *raw;
@@ -850,6 +854,7 @@
self->decoded_chars_used = 0;
self->pending_bytes_count = 0;
self->encodefunc = NULL;
+ self->b2cratio = 0.0;
if (encoding == NULL) {
/* Try os.device_encoding(fileno) */
@@ -1390,6 +1395,7 @@
PyObject *dec_flags = NULL;
PyObject *input_chunk = NULL;
PyObject *decoded_chars, *chunk_size;
+ Py_ssize_t nbytes, nchars;
int eof;
/* The return value is True unless EOF was reached. The decoded string is
@@ -1435,7 +1441,8 @@
goto fail;
assert(PyBytes_Check(input_chunk));
- eof = (PyBytes_Size(input_chunk) == 0);
+ nbytes = PyBytes_Size(input_chunk);
+ eof = (nbytes == 0);
if (Py_TYPE(self->decoder) == &PyIncrementalNewlineDecoder_Type) {
decoded_chars = _PyIncrementalNewlineDecoder_decode(
@@ -1450,7 +1457,12 @@
if (decoded_chars == NULL)
goto fail;
textiowrapper_set_decoded_chars(self, decoded_chars);
- if (PyUnicode_GET_SIZE(decoded_chars) > 0)
+ nchars = PyUnicode_GET_SIZE(decoded_chars);
+ if (nchars > 0)
+ self->b2cratio = (double) nbytes / nchars;
+ else
+ self->b2cratio = 0.0;
+ if (nchars > 0)
eof = 0;
if (self->telling) {
@@ -2139,8 +2151,12 @@
cookie_type cookie = {0,0,0,0,0};
PyObject *next_input;
Py_ssize_t chars_to_skip, chars_decoded;
+ Py_ssize_t skip_bytes, skip_back;
PyObject *saved_state = NULL;
char *input, *input_end;
+ char *dec_buffer;
+ Py_ssize_t dec_buffer_len;
+ int dec_flags;
CHECK_INITIALIZED(self);
CHECK_CLOSED(self);
@@ -2176,6 +2192,7 @@
#else
cookie.start_pos = PyLong_AsLong(posobj);
#endif
+ Py_DECREF(posobj);
if (PyErr_Occurred())
goto fail;
@@ -2190,57 +2207,99 @@
/* How many decoded characters have been used up since the snapshot? */
if (self->decoded_chars_used == 0) {
/* We haven't moved from the snapshot point. */
- Py_DECREF(posobj);
return textiowrapper_build_cookie(&cookie);
}
chars_to_skip = self->decoded_chars_used;
- /* Starting from the snapshot position, we will walk the decoder
- * forward until it gives us enough decoded characters.
- */
+ /* Decoder state will be restored at the end */
saved_state = PyObject_CallMethodObjArgs(self->decoder,
_PyIO_str_getstate, NULL);
if (saved_state == NULL)
goto fail;
- /* Note our initial start point. */
- if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
- goto fail;
+#define DECODER_GETSTATE() do { \
+ PyObject *_state = PyObject_CallMethodObjArgs(self->decoder, \
+ _PyIO_str_getstate, NULL); \
+ if (_state == NULL) \
+ goto fail; \
+ if (!PyArg_Parse(_state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) { \
+ Py_DECREF(_state); \
+ goto fail; \
+ } \
+ Py_DECREF(_state); \
+ } while (0)
- /* Feed the decoder one byte at a time. As we go, note the
- * nearest "safe start point" before the current location
- * (a point where the decoder has nothing buffered, so seek()
+ /* TODO: replace assert with exception */
+#define DECODER_DECODE(start, len, res) do { \
+ PyObject *_decoded = PyObject_CallMethod( \
+ self->decoder, "decode", "y#", start, len); \
+ if (_decoded == NULL) \
+ goto fail; \
+ assert (PyUnicode_Check(_decoded)); \
+ res = PyUnicode_GET_SIZE(_decoded); \
+ Py_DECREF(_decoded); \
+ } while (0)
+
+ /* Fast search for an acceptable start point, close to our
+ current pos */
+ skip_bytes = (Py_ssize_t) (self->b2cratio * chars_to_skip);
+ skip_back = 1;
+ assert(skip_back <= PyBytes_GET_SIZE(next_input));
+ input = PyBytes_AS_STRING(next_input);
+ while (skip_bytes > 0) {
+ /* Decode up to temptative start point */
+ if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
+ goto fail;
+ DECODER_DECODE(input, skip_bytes, chars_decoded);
+ if (chars_decoded <= chars_to_skip) {
+ DECODER_GETSTATE();
+ if (dec_buffer_len == 0) {
+ /* Before pos and no bytes buffered in decoder => OK */
+ cookie.dec_flags = dec_flags;
+ chars_to_skip -= chars_decoded;
+ break;
+ }
+ /* Skip back by buffered amount and reset heuristic */
+ skip_bytes -= dec_buffer_len;
+ skip_back = 1;
+ }
+ else {
+ /* We're too far ahead, skip back a bit */
+ skip_bytes -= skip_back;
+ skip_back *= 2;
+ }
+ }
+ if (skip_bytes <= 0) {
+ skip_bytes = 0;
+ if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
+ goto fail;
+ }
+
+ /* Note our initial start point. */
+ cookie.start_pos += skip_bytes;
+ cookie.chars_to_skip = chars_to_skip;
+ if (chars_to_skip == 0)
+ goto finally;
+
+ /* We should be close to the desired position. Now feed the decoder one
+ * byte at a time until we reach the `chars_to_skip` target.
+ * As we go, note the nearest "safe start point" before the current
+ * location (a point where the decoder has nothing buffered, so seek()
* can safely start from there and advance to this location).
*/
chars_decoded = 0;
input = PyBytes_AS_STRING(next_input);
input_end = input + PyBytes_GET_SIZE(next_input);
+ input += skip_bytes;
while (input < input_end) {
- PyObject *state;
- char *dec_buffer;
- Py_ssize_t dec_buffer_len;
- int dec_flags;
+ Py_ssize_t n;
- PyObject *decoded = PyObject_CallMethod(
- self->decoder, "decode", "y#", input, 1);
- if (decoded == NULL)
- goto fail;
- assert (PyUnicode_Check(decoded));
- chars_decoded += PyUnicode_GET_SIZE(decoded);
- Py_DECREF(decoded);
-
+ DECODER_DECODE(input, 1, n);
+ /* We got n chars for 1 byte */
+ chars_decoded += n;
cookie.bytes_to_feed += 1;
-
- state = PyObject_CallMethodObjArgs(self->decoder,
- _PyIO_str_getstate, NULL);
- if (state == NULL)
- goto fail;
- if (!PyArg_Parse(state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) {
- Py_DECREF(state);
- goto fail;
- }
- Py_DECREF(state);
+ DECODER_GETSTATE();
if (dec_buffer_len == 0 && chars_decoded <= chars_to_skip) {
/* Decoder buffer is empty, so this is a safe start point. */
@@ -2272,8 +2331,7 @@
}
}
- /* finally */
- Py_XDECREF(posobj);
+finally:
res = PyObject_CallMethod(self->decoder, "setstate", "(O)", saved_state);
Py_DECREF(saved_state);
if (res == NULL)
@@ -2284,8 +2342,7 @@
cookie.chars_to_skip = Py_SAFE_DOWNCAST(chars_to_skip, Py_ssize_t, int);
return textiowrapper_build_cookie(&cookie);
- fail:
- Py_XDECREF(posobj);
+fail:
if (saved_state) {
PyObject *type, *value, *traceback;
PyErr_Fetch(&type, &value, &traceback);