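Faster implementation for NULL thread pool: when threadpool is NULL, the 1D-tiled, 2D, and 2D-tiled compute functions run sequentially on the calling thread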
diff --git a/src/pthreadpool.c b/src/pthreadpool.c
index 6573fc3..704eb73 100644
--- a/src/pthreadpool.c
+++ b/src/pthreadpool.c
@@ -336,14 +336,22 @@
 	size_t range,
 	size_t tile)
 {
-	const size_t tile_range = divide_round_up(range, tile);
-	struct compute_1d_tiled_context context = {
-		.function = function,
-		.argument = argument,
-		.range = range,
-		.tile = tile
-	};
-	pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
+	if (threadpool == NULL) {
+		/* No thread pool provided: execute function sequentially on the calling thread */
+		for (size_t i = 0; i < range; i += tile) {
+			function(argument, i, min(range - i, tile));
+		}
+	} else {
+		/* Execute in parallel on the thread pool using linearized index */
+		const size_t tile_range = divide_round_up(range, tile);
+		struct compute_1d_tiled_context context = {
+			.function = function,
+			.argument = argument,
+			.range = range,
+			.tile = tile
+		};
+		pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
+	}
 }
 
 struct compute_2d_context {
@@ -365,12 +373,22 @@
 	size_t range_i,
 	size_t range_j)
 {
-	struct compute_2d_context context = {
-		.function = function,
-		.argument = argument,
-		.range_j = fxdiv_init_size_t(range_j)
-	};
-	pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
+	if (threadpool == NULL) {
+		/* No thread pool provided: execute function sequentially on the calling thread */
+		for (size_t i = 0; i < range_i; i++) {
+			for (size_t j = 0; j < range_j; j++) {
+				function(argument, i, j);
+			}
+		}
+	} else {
+		/* Execute in parallel on the thread pool using linearized index */
+		struct compute_2d_context context = {
+			.function = function,
+			.argument = argument,
+			.range_j = fxdiv_init_size_t(range_j)
+		};
+		pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
+	}
 }
 
 struct compute_2d_tiled_context {
@@ -404,18 +422,28 @@
 	size_t tile_i,
 	size_t tile_j)
 {
-	const size_t tile_range_i = divide_round_up(range_i, tile_i);
-	const size_t tile_range_j = divide_round_up(range_j, tile_j);
-	struct compute_2d_tiled_context context = {
-		.function = function,
-		.argument = argument,
-		.tile_range_j = fxdiv_init_size_t(tile_range_j),
-		.range_i = range_i,
-		.range_j = range_j,
-		.tile_i = tile_i,
-		.tile_j = tile_j
-	};
-	pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
+	if (threadpool == NULL) {
+		/* No thread pool provided: execute function sequentially on the calling thread */
+		for (size_t i = 0; i < range_i; i += tile_i) {
+			for (size_t j = 0; j < range_j; j += tile_j) {
+				function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
+			}
+		}
+	} else {
+		/* Execute in parallel on the thread pool using linearized index */
+		const size_t tile_range_i = divide_round_up(range_i, tile_i);
+		const size_t tile_range_j = divide_round_up(range_j, tile_j);
+		struct compute_2d_tiled_context context = {
+			.function = function,
+			.argument = argument,
+			.tile_range_j = fxdiv_init_size_t(tile_range_j),
+			.range_i = range_i,
+			.range_j = range_j,
+			.tile_i = tile_i,
+			.tile_j = tile_j
+		};
+		pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
+	}
 }
 
 void pthreadpool_destroy(struct pthreadpool* threadpool) {