Add recursive task scheduling strategy to taskloop implementation
Summary:
Taskloop implementation is extended by using recursive task scheduling.
Envirable KMP_TASKLOOP_MIN_TASKS added as a manual threshold for the user
to switch from recursive to linear tasks scheduling.
Details:
* The calculations for the loop parameters are moved from __kmp_taskloop_linear
upper level
* Initial calculation is done in the __kmpc_taskloop, further range splitting
is done in the __kmp_taskloop_recur.
* Added threshold to switch from recursive to linear tasks scheduling;
* One half of split range is scheduled as an internal task which just moves
sub-range parameters to the stealing thread that continues recursive
scheduling (if number of tasks still enough), the other half is processed
recursively;
* Internal task duplication routine fixed to assign parent task, that was not
needed when all tasks were scheduled by same thread, but is needed now.
Patch by Andrey Churbanov
Differential Revision: https://reviews.llvm.org/D35273
llvm-svn: 308338
diff --git a/openmp/runtime/test/tasking/kmp_taskloop.c b/openmp/runtime/test/tasking/kmp_taskloop.c
index 3b318bd..4b13793 100644
--- a/openmp/runtime/test/tasking/kmp_taskloop.c
+++ b/openmp/runtime/test/tasking/kmp_taskloop.c
@@ -1,4 +1,5 @@
// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
#include <stdio.h>
#include <omp.h>
#include "omp_my_sleep.h"
diff --git a/openmp/runtime/test/tasking/omp_taskloop_grainsize.c b/openmp/runtime/test/tasking/omp_taskloop_grainsize.c
new file mode 100644
index 0000000..ab5001d
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_taskloop_grainsize.c
@@ -0,0 +1,106 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: caculate how many times the iteration space is dispatched
+ * and judge if each dispatch has the requested grainsize
+ * It is possible for two adjacent chunks are executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_grainsize()
+{
+ int i, grainsize, count, tmp_count, result, num_off;
+ int *tmp, *tids, *tidsArray;
+
+ tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+ tids = tidsArray;
+
+ for (grainsize = 1; grainsize < 48; ++grainsize) {
+ fprintf(stderr, "Grainsize %d\n", grainsize);
+ count = tmp_count = num_off = 0;
+
+ for (i = 0; i < CFDMAX_SIZE; ++i) {
+ tids[i] = -1;
+ }
+
+ #pragma omp parallel shared(tids)
+ {
+ #pragma omp master
+ #pragma omp taskloop grainsize(grainsize)
+ for (i = 0; i < CFDMAX_SIZE; i++) {
+ tids[i] = omp_get_thread_num();
+ }
+ }
+
+ for (i = 0; i < CFDMAX_SIZE; ++i) {
+ if (tids[i] == -1) {
+ fprintf(stderr, " Iteration %d not touched!\n", i);
+ result++;
+ }
+ }
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tids[i] != tids[i + 1]) {
+ count++;
+ }
+ }
+
+ tmp = (int *)malloc(sizeof(int) * (count + 1));
+ tmp[0] = 1;
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tmp_count > count) {
+ printf("--------------------\nTestinternal Error: List too "
+ "small!!!\n--------------------\n");
+ break;
+ }
+ if (tids[i] != tids[i + 1]) {
+ tmp_count++;
+ tmp[tmp_count] = 1;
+ } else {
+ tmp[tmp_count]++;
+ }
+ }
+
+ // is grainsize statement working?
+ int num_tasks = CFDMAX_SIZE / grainsize;
+ int multiple1 = CFDMAX_SIZE / num_tasks;
+ int multiple2 = CFDMAX_SIZE / num_tasks + 1;
+ for (i = 0; i < count; i++) {
+ // it is possible for 2 adjacent chunks assigned to a same thread
+ if (tmp[i] % multiple1 != 0 && tmp[i] % multiple2 != 0) {
+ num_off++;
+ }
+ }
+
+ if (num_off > 1) {
+ fprintf(stderr, " The number of bad chunks is %d\n", num_off);
+ result++;
+ } else {
+ fprintf(stderr, " Everything ok\n");
+ }
+
+ free(tmp);
+ }
+ free(tidsArray);
+ return (result==0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for (i = 0; i < REPETITIONS; i++) {
+ if (!test_omp_taskloop_grainsize()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}
diff --git a/openmp/runtime/test/tasking/omp_taskloop_num_tasks.c b/openmp/runtime/test/tasking/omp_taskloop_num_tasks.c
new file mode 100644
index 0000000..2a932e3
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_taskloop_num_tasks.c
@@ -0,0 +1,67 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: caculate how many times the iteration space is dispatched
+ * and judge if each dispatch has the requested grainsize
+ * It is possible for two adjacent chunks are executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_num_tasks()
+{
+ int i;
+ int *tids;
+ int *tidsArray;
+ int count;
+ int result = 0;
+ int num_tasks;
+
+ for (num_tasks = 1; num_tasks < 120; ++num_tasks) {
+ count = 0;
+ tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+ tids = tidsArray;
+
+ #pragma omp parallel shared(tids)
+ {
+ int i;
+ #pragma omp master
+ #pragma omp taskloop num_tasks(num_tasks)
+ for (i = 0; i < CFDMAX_SIZE; i++) {
+ tids[i] = omp_get_thread_num();
+ }
+ }
+
+ for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+ if (tids[i] != tids[i + 1]) {
+ count++;
+ }
+ }
+
+ if (count > num_tasks) {
+ fprintf(stderr, "counted too many tasks: (wanted %d, got %d)\n",
+ num_tasks, count);
+ result++;
+ }
+ }
+
+ return (result==0);
+}
+
+int main()
+{
+ int i;
+ int num_failed=0;
+
+ for (i = 0; i < REPETITIONS; i++) {
+ if (!test_omp_taskloop_num_tasks()) {
+ num_failed++;
+ }
+ }
+ return num_failed;
+}