Switching XPath node sorting to Timsort
I use libxml xpath engine on quite large (and mostly "flat") xml files.
It seems that Shellsort, that is used in xmlXPathNodeSetSort is a
performance bottleneck for my case. I have read some posts about sorting
in libxml in the libxml archive, but I agree that qsort was not the way
to go. I experimented with Timsort instead and my results were good for
me. For about 10000 nodes, my test was about 5x faster with Timsort,
for 1000 nodes about 10% faster, for small data files, the difference
was not measurable.
* timsort.h: the algorithm, kept in a separate header
* xpath.c: plug in the new algorithm in xmlXPathNodeSetSort
* Makefile.am: add the header to the EXTRA_DIST
* doc/apibuild.py: avoid indexing the new header
diff --git a/xpath.c b/xpath.c
index df0e367..89da0c8 100644
--- a/xpath.c
+++ b/xpath.c
@@ -66,6 +66,15 @@
"Unimplemented block at %s:%d\n", \
__FILE__, __LINE__);
+/**
+ * WITH_TIM_SORT:
+ *
+ * Use the Timsort algorithm provided in timsort.h to sort
+ * nodeset as this is a great improvement over the old Shell sort
+ * used in xmlXPathNodeSetSort()
+ */
+#define WITH_TIM_SORT
+
/*
* XP_OPTIMIZED_NON_ELEM_COMPARISON:
* If defined, this will use xmlXPathCmpNodesExt() instead of
@@ -126,6 +135,41 @@
* any use of the macros IS_ASCII_CHARACTER and IS_ASCII_DIGIT)
*/
+/*
+ * Wrapper for the Timsort argorithm from timsort.h
+ */
+#ifdef WITH_TIM_SORT
+#define SORT_NAME libxml_domnode
+#define SORT_TYPE xmlNodePtr
+/**
+ * wrap_cmp:
+ * @x: a node
+ * @y: another node
+ *
+ * Comparison function for the Timsort implementation
+ *
+ * Returns -2 or +2 based on the order
+ */
+static
+int wrap_cmp( xmlNodePtr x, xmlNodePtr y );
+#ifdef XP_OPTIMIZED_NON_ELEM_COMPARISON
+ static int xmlXPathCmpNodesExt(xmlNodePtr, xmlNodePtr);
+ static int wrap_cmp( xmlNodePtr x, xmlNodePtr y )
+ {
+ int res = xmlXPathCmpNodesExt(x, y);
+ return res == -2 ? res : -res;
+ }
+#else
+ static int wrap_cmp( xmlNodePtr x, xmlNodePtr y )
+ {
+ int res = xmlXPathCmpNodes(x, y);
+ return res == -2 ? res : -res;
+ }
+#endif
+#define SORT_CMP(x, y) (wrap_cmp(x, y))
+#include "timsort.h"
+#endif /* WITH_TIM_SORT */
+
#if defined(LIBXML_XPATH_ENABLED) || defined(LIBXML_SCHEMAS_ENABLED)
/************************************************************************
@@ -3359,13 +3403,19 @@
*/
void
xmlXPathNodeSetSort(xmlNodeSetPtr set) {
+#ifndef WITH_TIM_SORT
int i, j, incr, len;
xmlNodePtr tmp;
+#endif
if (set == NULL)
return;
- /* Use Shell's sort to sort the node-set */
+#ifndef WITH_TIM_SORT
+ /*
+ * Use the old Shell's sort implementation to sort the node-set
+ * Timsort ought to be quite faster
+ */
len = set->nodeNr;
for (incr = len / 2; incr > 0; incr /= 2) {
for (i = incr; i < len; i++) {
@@ -3388,6 +3438,9 @@
}
}
}
+#else /* WITH_TIM_SORT */
+ libxml_domnode_tim_sort(set->nodeTab, set->nodeNr);
+#endif /* WITH_TIM_SORT */
}
#define XML_NODESET_DEFAULT 10