Improve the implementation of itertools.tee().
Formerly, underlying queue was implemented in terms of two lists. The
new queue is a series of singly-linked fixed length lists.
The new implementation runs much faster, supports multi-way tees, and
allows tees of tees without additional memory costs.
The root ideas for this structure were contributed by Andrew Koenig
and Guido van Rossum.
diff --git a/Modules/itertoolsmodule.c b/Modules/itertoolsmodule.c
index be2d735..a341a66 100644
--- a/Modules/itertoolsmodule.c
+++ b/Modules/itertoolsmodule.c
@@ -7,122 +7,102 @@
All rights reserved.
*/
-/* independent iterator object supporting the tee object ***************/
+/* tee object and with supporting function and objects ***************/
-/* The tee object maintains a queue of data seen by the leading iterator
- but not seen by the trailing iterator. When the leading iterator
- gets data from PyIter_Next() it appends a copy to the inbasket stack.
- When the trailing iterator needs data, it is popped from the outbasket
- stack. If the outbasket stack is empty, then it is filled from the
- inbasket (i.e. the queue is implemented using two stacks so that only
- O(n) operations like append() and pop() are used to access data and
- calls to reverse() never move any data element more than once).
-
- If one of the independent iterators gets deallocated, it sets tee's
- save_mode to zero so that future calls to PyIter_Next() stop getting
- saved to the queue (because there is no longer a second iterator that
- may need the data).
+/* The teedataobject pre-allocates space for LINKCELLS number of objects.
+ To help the object fit neatly inside cache lines (space for 16 to 32
+ pointers), the value should be a multiple of 16 minus space for
+ the other structure members including PyHEAD overhead. The larger the
+ value, the less memory overhead per object and the less time spent
+ allocating/deallocating new links. The smaller the number, the less
+ wasted space and the more rapid freeing of older data.
*/
+#define LINKCELLS 57
typedef struct {
PyObject_HEAD
PyObject *it;
- PyObject *inbasket;
- PyObject *outbasket;
- int save_mode;
- int num_seen;
-} teeobject;
+ int numread;
+ PyObject *nextlink;
+ PyObject *(values[LINKCELLS]);
+} teedataobject;
typedef struct {
PyObject_HEAD
- teeobject *tee;
- int num_seen;
-} iiobject;
+ teedataobject *dataobj;
+ int index;
+} teeobject;
-static PyTypeObject ii_type;
+static PyTypeObject teedataobject_type;
static PyObject *
-ii_next(iiobject *lz)
+teedataobject_new(PyObject *it)
{
- teeobject *to = lz->tee;
- PyObject *result, *tmp;
- int n;
+ teedataobject *tdo;
- if (lz->num_seen == to->num_seen) {
- /* This instance is leading, use iter to get more data */
- result = PyIter_Next(to->it);
- if (result == NULL)
- return NULL;
- if (to->save_mode)
- if(PyList_Append(to->inbasket, result) == -1){
- Py_DECREF(result);
- return NULL;
- }
- to->num_seen++;
- lz->num_seen++;
- return result;
- }
-
- /* This instance is trailing, get data from the queue */
- if (PyList_GET_SIZE(to->outbasket) == 0) {
- /* outbasket is empty, so refill from the inbasket */
- tmp = to->outbasket;
- to->outbasket = to->inbasket;
- to->inbasket = tmp;
- PyList_Reverse(to->outbasket);
- }
-
- /* Pop result from to->outbasket */
- n = PyList_GET_SIZE(to->outbasket);
- assert(n>0);
- result = PyList_GET_ITEM(to->outbasket, n-1);
- Py_INCREF(result);
- if (PyList_SetSlice(to->outbasket, n-1, n, NULL) == -1) {
- Py_DECREF(result);
+ tdo = PyObject_New(teedataobject, &teedataobject_type);
+ if (tdo == NULL)
return NULL;
+
+ tdo->numread = 0;
+ tdo->nextlink = NULL;
+ Py_INCREF(it);
+ tdo->it = it;
+ return (PyObject *)tdo;
+}
+
+static PyObject *
+teedataobject_jumplink(teedataobject *tdo)
+{
+ if (tdo->nextlink == NULL)
+ tdo->nextlink = teedataobject_new(tdo->it);
+ Py_INCREF(tdo->nextlink);
+ return tdo->nextlink;
+}
+
+static PyObject *
+teedataobject_getitem(teedataobject *tdo, int i)
+{
+ PyObject *value;
+
+ assert(i < LINKCELLS);
+ if (i < tdo->numread)
+ value = tdo->values[i];
+ else {
+ /* this is the lead iterator, so fetch more data */
+ assert(i == tdo->numread);
+ value = PyIter_Next(tdo->it);
+ if (value == NULL)
+ return NULL;
+ tdo->numread++;
+ tdo->values[i] = value;
}
- lz->num_seen++;
- return result;
+ Py_INCREF(value);
+ return value;
}
static void
-ii_dealloc(iiobject *ii)
+teedataobject_dealloc(teedataobject *tdo)
{
- teeobject *to = ii->tee;
- int n;
+ int i;
- PyObject_GC_UnTrack(ii);
- ii->tee->save_mode = 0; /* Stop saving data */
- if (ii->num_seen < to->num_seen) {
- /* It is the trailing iterator being freed; this means
- that the stored queue data is no longer needed */
- n = PyList_GET_SIZE(to->inbasket);
- PyList_SetSlice(to->inbasket, 0, n, NULL);
- n = PyList_GET_SIZE(to->outbasket);
- PyList_SetSlice(to->outbasket, 0, n, NULL);
- }
- Py_XDECREF(ii->tee);
- PyObject_GC_Del(ii);
+ for (i=0 ; i<tdo->numread ; i++)
+ Py_DECREF(tdo->values[i]);
+ Py_XDECREF(tdo->it);
+ Py_XDECREF(tdo->nextlink);
+ PyObject_Del(tdo);
}
-static int
-ii_traverse(iiobject *ii, visitproc visit, void *arg)
-{
- if (ii->tee)
- return visit((PyObject *)(ii->tee), arg);
- return 0;
-}
+PyDoc_STRVAR(teedataobject_doc, "Data container common to multiple tee objects.");
-PyDoc_STRVAR(ii_doc, "Independent iterator created by tee(iterable).");
-
-static PyTypeObject ii_type = {
+static PyTypeObject teedataobject_type = {
PyObject_HEAD_INIT(&PyType_Type)
0, /* ob_size */
- "itertools.tee_iterator", /* tp_name */
- sizeof(iiobject), /* tp_basicsize */
+ "itertools.tee_dataobject", /* tp_name */
+ sizeof(teedataobject), /* tp_basicsize */
0, /* tp_itemsize */
/* methods */
- (destructor)ii_dealloc, /* tp_dealloc */
+ (destructor)teedataobject_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
@@ -137,112 +117,96 @@
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
- Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
- ii_doc, /* tp_doc */
- (traverseproc)ii_traverse, /* tp_traverse */
- 0, /* tp_clear */
- 0, /* tp_richcompare */
- 0, /* tp_weaklistoffset */
- PyObject_SelfIter, /* tp_iter */
- (iternextfunc)ii_next, /* tp_iternext */
- 0, /* tp_methods */
+ Py_TPFLAGS_DEFAULT, /* tp_flags */
+ teedataobject_doc, /* tp_doc */
+ 0, /* tp_traverse */
};
-/* tee object **********************************************************/
static PyTypeObject tee_type;
static PyObject *
-tee_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+tee_next(teeobject *to)
{
+ PyObject *value, *link;
+
+ if (to->index >= LINKCELLS) {
+ link = teedataobject_jumplink(to->dataobj);
+ Py_XDECREF(to->dataobj);
+ to->dataobj = (teedataobject *)link;
+ to->index = 0;
+ }
+ value = teedataobject_getitem(to->dataobj, to->index);
+ if (value == NULL)
+ return NULL;
+ to->index++;
+ return value;
+}
+
+static PyObject *
+tee_copy(teeobject *to)
+{
+ teeobject *newto;
+
+ newto = PyObject_New(teeobject, &tee_type);
+ if (newto == NULL)
+ return NULL;
+ Py_INCREF(to->dataobj);
+ newto->dataobj = to->dataobj;
+ newto->index = to->index;
+ return (PyObject *)newto;
+}
+
+PyDoc_STRVAR(teecopy_doc, "Returns an independent iterator.");
+
+static PyObject *
+tee_fromiterable(PyObject *iterable)
+{
+ teeobject *to;
PyObject *it = NULL;
+
+ it = PyObject_GetIter(iterable);
+ if (it == NULL)
+ return NULL;
+ if (PyObject_TypeCheck(it, &tee_type)) {
+ to = (teeobject *)tee_copy((teeobject *)it);
+ goto done;
+ }
+
+ to = PyObject_New(teeobject, &tee_type);
+ if (to == NULL)
+ goto done;
+ to->dataobj = (teedataobject *)teedataobject_new(it);
+ to->index = 0;
+done:
+ Py_XDECREF(it);
+ return (PyObject *)to;
+}
+
+static PyObject *
+tee_new(PyTypeObject *type, PyObject *args, PyObject *kw)
+{
PyObject *iterable;
- PyObject *inbasket = NULL, *outbasket = NULL, *result = NULL;
- teeobject *to = NULL;
- int i;
if (!PyArg_UnpackTuple(args, "tee", 1, 1, &iterable))
return NULL;
-
- it = PyObject_GetIter(iterable);
- if (it == NULL) goto fail;
-
- inbasket = PyList_New(0);
- if (inbasket == NULL) goto fail;
-
- outbasket = PyList_New(0);
- if (outbasket == NULL) goto fail;
-
- to = PyObject_GC_New(teeobject, &tee_type);
- if (to == NULL) goto fail;
-
- to->it = it;
- to->inbasket = inbasket;
- to->outbasket = outbasket;
- to->save_mode = 1;
- to->num_seen = 0;
- PyObject_GC_Track(to);
-
- /* create independent iterators */
- result = PyTuple_New(2);
- if (result == NULL) goto fail;
- for (i=0 ; i<2 ; i++) {
- iiobject *indep_it = PyObject_GC_New(iiobject, &ii_type);
- if (indep_it == NULL) goto fail;
- Py_INCREF(to);
- indep_it->tee = to;
- indep_it->num_seen = 0;
- PyObject_GC_Track(indep_it);
- PyTuple_SET_ITEM(result, i, (PyObject *)indep_it);
- }
- goto succeed;
-fail:
- Py_XDECREF(it);
- Py_XDECREF(inbasket);
- Py_XDECREF(outbasket);
- Py_XDECREF(result);
-succeed:
- Py_XDECREF(to);
- return result;
+ return tee_fromiterable(iterable);
}
static void
tee_dealloc(teeobject *to)
{
- PyObject_GC_UnTrack(to);
- Py_XDECREF(to->inbasket);
- Py_XDECREF(to->outbasket);
- Py_XDECREF(to->it);
- PyObject_GC_Del(to);
+ Py_XDECREF(to->dataobj);
+ PyObject_Del(to);
}
-static int
-tee_traverse(teeobject *to, visitproc visit, void *arg)
-{
- int err;
+PyDoc_STRVAR(teeobject_doc,
+"Iterator wrapped to make it copyable");
- if (to->it) {
- err = visit(to->it, arg);
- if (err)
- return err;
- }
- if (to->inbasket) {
- err = visit(to->inbasket, arg);
- if (err)
- return err;
- }
- if (to->outbasket) {
- err = visit(to->outbasket, arg);
- if (err)
- return err;
- }
- return 0;
-}
-
-PyDoc_STRVAR(tee_doc,
-"tee(iterable) --> (it1, it2)\n\
-\n\
-Split the iterable into two independent iterables.");
+static PyMethodDef tee_methods[] = {
+ {"__copy__", (PyCFunction)tee_copy, METH_NOARGS, teecopy_doc},
+ {NULL, NULL} /* sentinel */
+};
static PyTypeObject tee_type = {
PyObject_HEAD_INIT(NULL)
@@ -266,15 +230,15 @@
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
- Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /* tp_flags */
- tee_doc, /* tp_doc */
- (traverseproc)tee_traverse, /* tp_traverse */
+ Py_TPFLAGS_DEFAULT, /* tp_flags */
+ teeobject_doc, /* tp_doc */
+ 0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
- 0, /* tp_iter */
- 0, /* tp_iternext */
- 0, /* tp_methods */
+ PyObject_SelfIter, /* tp_iter */
+ (iternextfunc)tee_next, /* tp_iternext */
+ tee_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
@@ -285,8 +249,52 @@
0, /* tp_init */
0, /* tp_alloc */
tee_new, /* tp_new */
+ PyObject_Del, /* tp_free */
};
+static PyObject *
+tee(PyObject *self, PyObject *args)
+{
+ int i, n=2;
+ PyObject *it, *iterable, *copyable, *result;
+
+ if (!PyArg_ParseTuple(args, "O|i", &iterable, &n))
+ return NULL;
+ result = PyTuple_New(n);
+ if (result == NULL)
+ return NULL;
+ if (n == 0)
+ return result;
+ it = PyObject_GetIter(iterable);
+ if (it == NULL) {
+ Py_DECREF(result);
+ return NULL;
+ }
+ if (!PyObject_HasAttrString(it, "__copy__")) {
+ copyable = tee_fromiterable(it);
+ Py_DECREF(it);
+ if (copyable == NULL) {
+ Py_DECREF(result);
+ return NULL;
+ }
+ } else
+ copyable = it;
+ PyTuple_SET_ITEM(result, 0, copyable);
+ for (i=1 ; i<n ; i++) {
+ copyable = PyObject_CallMethod(copyable, "__copy__", NULL);
+ if (copyable == NULL) {
+ Py_DECREF(result);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(result, i, copyable);
+ }
+ return result;
+}
+
+PyDoc_STRVAR(tee_doc,
+"tee(iterable, n=2) --> tuple of n independent iterators.");
+
+
/* cycle object **********************************************************/
typedef struct {
@@ -2091,13 +2099,18 @@
seq[start:stop:step]\n\
imap(fun, p, q, ...) --> fun(p0, q0), fun(p1, q1), ...\n\
starmap(fun, seq) --> fun(*seq[0]), fun(*seq[1]), ...\n\
-tee(it) --> (it1, it2) splits one iterator into two \n\
+tee(it, n=2) --> (it1, it2 , ... itn) splits one iterator into n\n\
chain(p, q, ...) --> p0, p1, ... plast, q0, q1, ... \n\
takewhile(pred, seq) --> seq[0], seq[1], until pred fails\n\
dropwhile(pred, seq) --> seq[n], seq[n+1], starting when pred fails\n\
");
+static PyMethodDef module_methods[] = {
+ {"tee", (PyCFunction)tee, METH_VARARGS, tee_doc},
+ {NULL, NULL} /* sentinel */
+};
+
PyMODINIT_FUNC
inititertools(void)
{
@@ -2105,7 +2118,6 @@
PyObject *m;
char *name;
PyTypeObject *typelist[] = {
- &tee_type,
&cycle_type,
&dropwhile_type,
&takewhile_type,
@@ -2121,7 +2133,7 @@
NULL
};
- m = Py_InitModule3("itertools", NULL, module_doc);
+ m = Py_InitModule3("itertools", module_methods, module_doc);
for (i=0 ; typelist[i] != NULL ; i++) {
if (PyType_Ready(typelist[i]) < 0)
@@ -2131,4 +2143,10 @@
Py_INCREF(typelist[i]);
PyModule_AddObject(m, name+1, (PyObject *)typelist[i]);
}
+
+ if (PyType_Ready(&teedataobject_type) < 0)
+ return;
+ if (PyType_Ready(&tee_type) < 0)
+ return;
+
}