#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#include "tsan_annotations.h"
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
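// The routines guarded by BUILD_TIED_TASK_STACK below implement a per-thread
// stack of suspended tied tasks (td_susp_tied_tasks). The stack is a linked
// list of fixed-size blocks of kmp_taskdata_t pointers; ts_top points at the
// next free slot and ts_entries counts live entries. This is debug-only
// bookkeeping and is compiled out unless BUILD_TIED_TASK_STACK is defined.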
#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
// from top to bottom.
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up stack_top if we need to pop from the previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(thread,
                        stack_block); // free the block, if not the first one
    }
    stack_block = next_block;
  }
  // reset the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // don't push anything if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next != NULL) {
      // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // don't pop anything if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from the previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // if we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}
#endif // BUILD_TIED_TASK_STACK
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check whether the candidate obeys the Task Scheduling Constraints (TSC):
    // only a descendant of all deferred tied tasks can be scheduled; checking
    // the last one is enough, as it in turn is a descendant of all others.
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on a barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (node && (node->dn.mtx_num_locks > 0)) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previously acquired locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks were acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}
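// __kmp_realloc_task_deque: double the size of a thread's task deque and copy
// the existing entries, starting from td_deque_head, into the new storage.
// The caller is expected to hold td_deque_lock while the deque is swapped.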
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}
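// __kmp_push_task: try to append a task to the encountering thread's deque.
// Serialized tasks are never queued (TASK_NOT_PUSHED). If the deque is full,
// the task is either executed immediately (when the scheduling constraints
// allow it) or the deque is grown under td_deque_lock before the push.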
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find the tasking deque specific to the encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only the owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if the deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      // expand the deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from a thread outside of
    // OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand the deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but the calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count

  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}
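// __kmp_pop_current_task_from_thread / __kmp_push_current_task_to_thread:
// maintain th_current_task across team begin/end. The master's implicit task
// (tid 0) adopts the thread's previous current task as its parent, and the
// other implicit tasks of the team share that same parent.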
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // the current task of the thread is a parent of the just created implicit
  // tasks of the new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark the currently executing task as suspended
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as the current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
}
#if OMPT_SUPPORT
// __ompt_task_init: initialize the OMPT fields maintained by a task
static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.ndeps = 0;
  task->ompt_task_info.deps = NULL;
}
// __ompt_task_start: build and trigger the task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  // let OMPT know that we're about to run this task
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}
// __ompt_task_finish: build and trigger the final task-schedule event
static inline void
__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
                   ompt_task_status_t status = ompt_task_complete) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
      taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
    status = ompt_task_cancel;
  }

  // let OMPT know that we're returning to the callee task
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        &((resumed_task ? resumed_task
                        : (taskdata->ompt_task_info.scheduling_parent
                               ? taskdata->ompt_task_info.scheduling_parent
                               : taskdata->td_parent))
              ->ompt_task_info.task_data));
  }
}
#endif // OMPT_SUPPORT
template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // execute this task immediately, not deferred
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}
#if OMPT_SUPPORT
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT
// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared-variable block associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up the
    // ancestor tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of the implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}
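// __kmp_task_finish: bookkeeping performed when a task finishes: pop it from
// the tied-task stack (debug builds), handle the untied reference count,
// release mutexinoutset locks and dependences, decrement the parent's
// incomplete-children and taskgroup counters, and resume resumed_task.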
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, may be continued by another thread later;
      // do not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // in a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

#if OMPT_SUPPORT
  if (ompt)
    __ompt_task_finish(task, resumed_task);
#endif

  // Check mutexinoutset dependencies, release locks
  kmp_depnode_t *node = taskdata->td_depnode;
  if (node && (node->dn.mtx_num_locks < 0)) {
    // negative num_locks means all locks were acquired
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
    for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      __kmp_release_lock(node->dn.mtx_locks[i], gtid);
    }
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of counts if team parallel and tasking not
  // serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
    if (taskdata->td_taskgroup)
      KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    __kmp_release_deps(gtid, taskdata);
  } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
    // if we found proxy tasks there could exist a dependency chain
    // with the proxy task as origin
    __kmp_release_deps(gtid, taskdata);
  }

  // td_flags.executing must be marked as 0 after the predecrement of the
  // incomplete-children counter
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // If the task's destructor thunk flag has been set, invoke the destructor
  // thunk that was generated by the compiler.
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  // bookkeeping for resuming task: note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // in a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first so an asynchronous inquiry never sees the
  // freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));
}
template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide the task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif
}
#if OMPT_SUPPORT
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT
// __kmpc_omp_task_complete_if0: report that a task has completed execution
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED
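// __kmp_init_implicit_task: initialize the kmp_taskdata_t of an implicit task
// in t_implicit_task_taskdata[tid]. Implicit tasks are always tied, executed
// immediately (task_serial), and marked started/executing from the outset.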
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

  task->td_depnode = NULL;
  task->td_last_tied = task;

  if (set_curr_task) { // only do this init the first time the thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have a taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}
// __kmp_finish_implicit_task: release resources associated with implicit tasks
// at the end of a parallel region
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}
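// __kmp_round_up_to_val: round size up to a multiple of val (val is assumed to
// be a power of two here, e.g. sizeof(void *)), avoiding the addition when it
// would overflow past KMP_SIZE_T_MAX.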
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
}
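// __kmp_task_alloc allocates a single block that holds the task bookkeeping
// and the user-visible task back to back (layout inferred from the offsets
// computed below):
//
//   +----------------+-----------------------+--------- ... ----------+
//   | kmp_taskdata_t | kmp_task_t + privates | shareds (ptr-aligned)  |
//   +----------------+-----------------------+--------- ... ----------+
//   ^taskdata         ^task = KMP_TASKDATA_TO_TASK(taskdata)
//                                             ^taskdata + shareds_offset
//
// shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t, rounded up to
// sizeof(void *) so the shareds block stays pointer-aligned.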
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }
  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // An untied task encountered causes the TSC algorithm to check the entire
    // deque of the victim thread; if no untied task is encountered, checking
    // the head of the deque is enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  if (flags->proxy == TASK_PROXY) {
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec? if so,
       we need tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized;
         set up a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only the owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t
  // struct to align pointers in the shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // Serialize the task if the team is serialized so implicit parallel region
  // tasks are not left until program termination to execute; it also helps
  // locality to execute them immediately.
  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because it counts the current task and its children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
// Only need to keep track of child task counts if team parallel and tasking
// not serialized, or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks
    // since implicit ones are not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags

#if OMP_45_ENABLED
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
                input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));
#else
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
                sizeof_kmp_task_t, sizeof_shareds, task_entry));
#endif

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  return 0;
}
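// __kmp_invoke_task: execute the task routine. Proxy tasks that were already
// completed only run their bottom-half finish. Otherwise the task is started,
// optionally discarded if its taskgroup or parallel region was cancelled, run
// through the GOMP-compatible or native entry point, and finally finished.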
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
  int discard = 0 /* false */;

  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but needs to run its
    // bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // Store the thread's OMPT state and restore it after the task
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task);
  }

  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      // this task belongs to a cancelled construct; discard it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer-level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
#endif
  }

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
}
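// The entry points below first try to defer a new task via __kmp_push_task;
// if the task cannot be queued (serialized team, full deque, or a proxy task),
// it is invoked immediately on the encountering thread, optionally forcing
// task_serial so nested tasks are serialized as well.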
kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
                                kmp_task_t *new_task) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent;
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent = new_taskdata->td_parent;
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          parent ? &(parent->ompt_task_info.task_data) : &task_data,
          parent ? &(parent->ompt_task_info.frame) : NULL,
          &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it. If the queue fills up, then we'll execute it. */

  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  KA_TRACE(10,
           ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
            "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
            gtid, loc_ref, new_taskdata));

  ANNOTATE_HAPPENS_BEFORE(new_task);
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return TASK_CURRENT_NOT_QUEUED;
}
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                         bool serialize_immediate) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

/* Should we execute the new task or queue it? For now, let's just always try
   to queue it. If the queue fills up, then we'll execute it. */
#if OMP_45_ENABLED
  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
      __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
#else
  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
#endif
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    if (serialize_immediate)
      new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  ANNOTATE_HAPPENS_BEFORE(new_task);
  return TASK_CURRENT_NOT_QUEUED;
}
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    if (!new_taskdata->td_flags.started) {
      OMPT_STORE_RETURN_ADDRESS(gtid);
      parent = new_taskdata->td_parent;
      if (!parent->ompt_task_info.frame.enter_frame.ptr) {
        parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      }
      if (ompt_enabled.ompt_callback_task_create) {
        ompt_data_t task_data = ompt_data_none;
        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
            parent ? &(parent->ompt_task_info.task_data) : &task_data,
            parent ? &(parent->ompt_task_info.frame) : NULL,
            &(new_taskdata->ompt_task_info.task_data),
            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
            OMPT_LOAD_RETURN_ADDRESS(gtid));
      }
    } else {
      // We are scheduling the continuation of an UNTIED task.
      // Scheduling back to the parent task.
      __ompt_task_finish(new_task,
                         new_taskdata->ompt_task_info.scheduling_parent,
                         ompt_task_switch);
      new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, void *codeptr_ra) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));

#if OMPT_SUPPORT
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
    parent = new_taskdata->td_parent;
    if (!parent->ompt_task_info.frame.enter_frame.ptr)
      parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          parent ? &(parent->ompt_task_info.task_data) : &task_data,
          parent ? &(parent->ompt_task_info.frame) : NULL,
          &(new_taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
          codeptr_ra);
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;

    if (ompt) {
      my_task_data = &(taskdata->ompt_task_info.task_data);
      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);

      taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;

      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }

      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

    // Debugger: the taskwait is active; store location and encountering thread
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
    if (must_wait) {
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &(taskdata->td_incomplete_child_tasks)),
                       0U);
      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger: the taskwait is completed; location remains, thread is negated
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt) {
      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

    ANNOTATE_HAPPENS_AFTER(taskdata);
  }

  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
                                            return_address);
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
// __kmpc_omp_taskwait: wait until all tasks generated by the current task are
// complete
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
}
// __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;

  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));

  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
    // Debugger: the taskwait is active; store location and encountering thread
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    if (!taskdata->td_flags.team_serial) {
      kmp_task_team_t *task_team = thread->th.th_task_team;
      if (task_team != NULL) {
        if (KMP_TASKING_ENABLED(task_team)) {
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
          __kmp_execute_tasks_32(
              thread, gtid, NULL, FALSE,
              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
              __kmp_task_stealing_constraint);
#if OMPT_SUPPORT
          if (UNLIKELY(ompt_enabled.enabled))
            thread->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
        }
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger: the taskwait is completed; location remains, thread is negated
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
  }

  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}
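// Task reduction support (OpenMP 5.0). A taskgroup keeps an array of
// kmp_taskred_data_t items, one per reduction variable; each item records the
// shared location, the per-thread private storage, and the compiler-provided
// init/combine/fini thunks. kmp_task_red_input_t is the older interface,
// kmp_taskred_input_t the newer one that also carries reduce_orig.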
// Flags for special info per task reduction item.
typedef struct kmp_taskred_flags {
  // 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads)
  unsigned lazy_priv : 1;
  unsigned reserved31 : 31;
} kmp_taskred_flags_t;

// Internal struct for reduction data item related info (old interface).
typedef struct kmp_task_red_input {
  void *reduce_shar; // shared between tasks item to reduce into
  size_t reduce_size; // size of data item in bytes
  // three compiler-generated routines (init and fini are optional):
  void *reduce_init; // data initialization routine (single parameter)
  void *reduce_fini; // data finalization routine
  void *reduce_comb; // data combiner routine
  kmp_taskred_flags_t flags; // flags for additional info from compiler
} kmp_task_red_input_t;

// Internal struct for reduction data item related info kept by the runtime.
typedef struct kmp_taskred_data {
  void *reduce_shar; // shared between tasks item to reduce into
  size_t reduce_size; // size of data item
  kmp_taskred_flags_t flags; // flags for additional info from compiler
  void *reduce_priv; // array of thread-specific items
  void *reduce_pend; // end of private data for faster comparison op
  void *reduce_comb; // data combiner routine
  void *reduce_init; // data initialization routine (two parameters)
  void *reduce_fini; // data finalization routine
  void *reduce_orig; // original item (can be used in UDR initializer)
} kmp_taskred_data_t;

// Internal struct for reduction data item related info (new interface).
typedef struct kmp_taskred_input {
  void *reduce_shar; // shared between tasks item to reduce into
  void *reduce_orig; // original reduction item used for initialization
  size_t reduce_size; // size of data item
  // three compiler-generated routines (init and fini are optional):
  void *reduce_init; // data initialization routine (two parameters)
  void *reduce_fini; // data finalization routine
  void *reduce_comb; // data combiner routine
  kmp_taskred_flags_t flags; // flags for additional info from compiler
} kmp_taskred_input_t;
template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);

template <>
void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                             kmp_task_red_input_t &src) {
  item.reduce_orig = NULL; // old interface carries no original item
}

template <>
void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                            kmp_taskred_input_t &src) {
  if (src.reduce_orig != NULL) {
    item.reduce_orig = src.reduce_orig;
  } else {
    item.reduce_orig = src.reduce_shar;
  } // non-NULL reduce_orig means new interface used
}

template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);

template <>
void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                           int offset) {
  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
}

template <>
void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                          int offset) {
  ((void (*)(void *, void *))item.reduce_init)(
      (char *)(item.reduce_priv) + offset, item.reduce_orig);
}
template <typename T>
void *__kmp_task_reduction_init(int gtid, int num, T *data) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
  kmp_int32 nth = thread->th.th_team_nproc;
  kmp_taskred_data_t *arr;

  // check input data just in case
  KMP_ASSERT(tg != NULL);
  KMP_ASSERT(data != NULL);
  KMP_ASSERT(num > 0);
  if (nth == 1) {
    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
                  gtid, tg));
    return (void *)tg;
  }
  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
                gtid, tg, num));
  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
      thread, num * sizeof(kmp_taskred_data_t));
  for (int i = 0; i < num; ++i) {
    size_t size = data[i].reduce_size - 1;
    // round the size up to a cache line per thread-specific item
    size += CACHE_LINE - size % CACHE_LINE;
    KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
    arr[i].reduce_shar = data[i].reduce_shar;
    arr[i].reduce_size = size;
    arr[i].flags = data[i].flags;
    arr[i].reduce_comb = data[i].reduce_comb;
    arr[i].reduce_init = data[i].reduce_init;
    arr[i].reduce_fini = data[i].reduce_fini;
    __kmp_assign_orig<T>(arr[i], data[i]);
    if (!arr[i].flags.lazy_priv) {
      // allocate cache-line aligned block and fill it with zeros
      arr[i].reduce_priv = __kmp_allocate(nth * size);
      arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
      if (arr[i].reduce_init != NULL) {
        // initialize all thread-specific items
        for (int j = 0; j < nth; ++j) {
          __kmp_call_init<T>(arr[i], j * size);
        }
      }
    } else {
      // only allocate space for pointers now; objects will be lazily
      // allocated and initialized if and when requested
      arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
    }
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
  return (void *)tg;
}
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
}
void *__kmpc_taskred_init(int gtid, int num, void *data) {
  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}
// Copy task reduction data (except for shared pointers).
template <typename T>
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
                                    kmp_taskgroup_t *tg, void *reduce_data) {
  kmp_taskred_data_t *arr;
  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
                " from data %p\n",
                thr, tg, reduce_data));
  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
      thr, num * sizeof(kmp_taskred_data_t));
  // threads will share private copies, thunk routines, sizes, flags, etc.
  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
  for (int i = 0; i < num; ++i) {
    arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
}
void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_int32 nth = thread->th.th_team_nproc;
  if (nth == 1)
    return data; // nothing to do

  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
  if (tg == NULL)
    tg = thread->th.th_current_task->td_taskgroup;
  KMP_ASSERT(tg != NULL);
  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
  kmp_int32 num = tg->reduce_num_data;
  kmp_int32 tid = thread->th.th_info.ds.ds_tid;

  KMP_ASSERT(data != NULL);
  while (tg != NULL) {
    for (int i = 0; i < num; ++i) {
      if (!arr[i].flags.lazy_priv) {
        if (data == arr[i].reduce_shar ||
            (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
          return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
      } else {
        // check shared location first
        void **p_priv = (void **)(arr[i].reduce_priv);
        if (data == arr[i].reduce_shar)
          goto found;
        // check if we got some thread-specific location as the parameter
        for (int j = 0; j < nth; ++j)
          if (data == p_priv[j])
            goto found;
        continue; // not found, continue search
      found:
        if (p_priv[tid] == NULL) {
          // allocate the thread-specific object lazily
          p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
          if (arr[i].reduce_init != NULL) {
            if (arr[i].reduce_orig != NULL) { // new interface
              ((void (*)(void *, void *))arr[i].reduce_init)(
                  p_priv[tid], arr[i].reduce_orig);
            } else { // old interface (single parameter)
              ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
            }
          }
        }
        return p_priv[tid];
      }
    }
    tg = tg->parent;
    arr = (kmp_taskred_data_t *)(tg->reduce_data);
    num = tg->reduce_num_data;
  }
  KMP_ASSERT2(0, "Unknown task reduction item");
  return NULL; // ERROR, this line never executed
}
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
  kmp_int32 nth = th->th.th_team_nproc;
  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
  kmp_int32 num = tg->reduce_num_data;
  for (int i = 0; i < num; ++i) {
    void *sh_data = arr[i].reduce_shar;
    void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
    void (*f_comb)(void *, void *) =
        (void (*)(void *, void *))(arr[i].reduce_comb);
    if (!arr[i].flags.lazy_priv) {
      void *pr_data = arr[i].reduce_priv;
      size_t size = arr[i].reduce_size;
      for (int j = 0; j < nth; ++j) {
        void *priv_data = (char *)pr_data + j * size;
        f_comb(sh_data, priv_data); // combine results
        if (f_fini)
          f_fini(priv_data); // finalize if needed
      }
    } else {
      void **pr_data = (void **)(arr[i].reduce_priv);
      for (int j = 0; j < nth; ++j) {
        if (pr_data[j] != NULL) {
          f_comb(sh_data, pr_data[j]); // combine results
          if (f_fini)
            f_fini(pr_data[j]); // finalize if needed
          __kmp_free(pr_data[j]);
        }
      }
    }
    __kmp_free(arr[i].reduce_priv);
  }
  __kmp_thread_free(th, arr);
  tg->reduce_data = NULL;
  tg->reduce_num_data = 0;
}
// Cleanup task reduction data for parallel or worksharing; do not touch
// task-private data other threads may still be working with.
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
  __kmp_thread_free(th, tg->reduce_data);
  tg->reduce_data = NULL;
  tg->reduce_num_data = 0;
}
template <typename T>
void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                         int num, T *data) {
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_int32 nth = thr->th.th_team_nproc;
  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
  if (nth == 1) {
    KA_TRACE(10,
             ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
              gtid, thr->th.th_current_task->td_taskgroup));
    return (void *)thr->th.th_current_task->td_taskgroup;
  }
  kmp_team_t *team = thr->th.th_team;
  void *reduce_data;
  kmp_taskgroup_t *tg;
  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
  if (reduce_data == NULL &&
      __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
                                 (void *)1)) {
    // single thread enters this block to initialize the common reduction data
    KMP_DEBUG_ASSERT(reduce_data == NULL);
    // first initialize own data, then make a copy other threads can use
    tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
    reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
    KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
    // fini counters should be 0 at this point
    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
    KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
  } else {
    while (
        (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
        (void *)1) { // wait for task reduction initialization
      KMP_CPU_PAUSE();
    }
    KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be a valid pointer here
    tg = thr->th.th_current_task->td_taskgroup;
    __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
  }
  return tg;
}
void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                          int num, void *data) {
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
                                            (kmp_task_red_input_t *)data);
}
void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
                                   void *data) {
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
                                            (kmp_taskred_input_t *)data);
}
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  __kmpc_end_taskgroup(loc, gtid);
}
2437 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2438 kmp_info_t *thread = __kmp_threads[gtid];
2439 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2440 kmp_taskgroup_t *tg_new =
2441 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2442 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2443 KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2444 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2445 tg_new->parent = taskdata->td_taskgroup;
2447 tg_new->reduce_data = NULL;
2448 tg_new->reduce_num_data = 0;
2450 taskdata->td_taskgroup = tg_new;
2452 #if OMPT_SUPPORT && OMPT_OPTIONAL
2453 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2454 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2456 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2457 kmp_team_t *team = thread->th.th_team;
2458 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2460 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2462 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2463 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2464 &(my_task_data), codeptr);
2471 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2472 kmp_info_t *thread = __kmp_threads[gtid];
2473 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2474 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2475 int thread_finished = FALSE;
2477 #if OMPT_SUPPORT && OMPT_OPTIONAL
2479 ompt_data_t my_task_data;
2480 ompt_data_t my_parallel_data;
2482 if (UNLIKELY(ompt_enabled.enabled)) {
2483 team = thread->th.th_team;
2484 my_task_data = taskdata->ompt_task_info.task_data;
2486 my_parallel_data = team->t.ompt_team_info.parallel_data;
2487 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2489 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2493 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2494 KMP_DEBUG_ASSERT(taskgroup != NULL);
2495 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2497 if (__kmp_tasking_mode != tskm_immediate_exec) {
2499 taskdata->td_taskwait_counter += 1;
2500 taskdata->td_taskwait_ident = loc;
2501 taskdata->td_taskwait_thread = gtid + 1;
2505 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2506 if (itt_sync_obj != NULL)
2507 __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2510 #if OMPT_SUPPORT && OMPT_OPTIONAL
2511 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2512 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2513 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2514 &(my_task_data), codeptr);
2519 if (!taskdata->td_flags.team_serial ||
2520 (thread->th.th_task_team != NULL &&
2521 thread->th.th_task_team->tt.tt_found_proxy_tasks))
2523 if (!taskdata->td_flags.team_serial)
2526 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2528 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2529 flag.execute_tasks(thread, gtid, FALSE,
2530 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2531 __kmp_task_stealing_constraint);
2534 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2536 #if OMPT_SUPPORT && OMPT_OPTIONAL
2537 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2538 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2539 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2540 &(my_task_data), codeptr);
2545 if (itt_sync_obj != NULL)
2546 __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2549 KMP_DEBUG_ASSERT(taskgroup->count == 0);
2552 if (taskgroup->reduce_data != NULL) {
2555 kmp_team_t *t = thread->th.th_team;
2556 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2558 void *priv0 = arr[0].reduce_priv;
2559 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2560 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2562 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2563 if (cnt == thread->th.th_team_nproc - 1) {
2566 __kmp_task_reduction_fini(thread, taskgroup);
2569 __kmp_thread_free(thread, reduce_data);
2570 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2571 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2575 __kmp_task_reduction_clean(thread, taskgroup);
2577 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2579 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2581 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2582 if (cnt == thread->th.th_team_nproc - 1) {
2584 __kmp_task_reduction_fini(thread, taskgroup);
2587 __kmp_thread_free(thread, reduce_data);
2588 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2589 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2593 __kmp_task_reduction_clean(thread, taskgroup);
2597 __kmp_task_reduction_fini(thread, taskgroup);
2602 taskdata->td_taskgroup = taskgroup->parent;
2603 __kmp_thread_free(thread, taskgroup);
2605 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2607 ANNOTATE_HAPPENS_AFTER(taskdata);
2609 #if OMPT_SUPPORT && OMPT_OPTIONAL
2610 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2611 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2612 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2613 &(my_task_data), codeptr);
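// Minimal standalone sketch (hypothetical names, not runtime code) of what the
// end-of-taskgroup wait above reduces to: keep helping with other tasks until
// an atomic descendant counter drains to zero.
#if 0
#include <atomic>

struct toy_taskgroup {
  std::atomic<int> count{0}; // incremented at task creation, decremented at finish
};

// execute_one_task stands in for flag.execute_tasks(); it should run one queued
// task if any is available and return whether it did.
static void toy_end_taskgroup(toy_taskgroup &tg, bool (*execute_one_task)()) {
  while (tg.count.load(std::memory_order_acquire) != 0) {
    if (!execute_one_task()) {
      // nothing runnable right now; a real runtime would yield or go to sleep
    }
  }
}
#endif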
2620 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2621 kmp_task_team_t *task_team,
2622 kmp_int32 is_constrained) {
2624 kmp_taskdata_t *taskdata;
2625 kmp_thread_data_t *thread_data;
2628 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2629 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2632 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2634 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2635 gtid, thread_data->td.td_deque_ntasks,
2636 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2638 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2640 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2641 "ntasks=%d head=%u tail=%u\n",
2642 gtid, thread_data->td.td_deque_ntasks,
2643 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2647 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2649 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2650 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2652 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2653 "ntasks=%d head=%u tail=%u\n",
2654 gtid, thread_data->td.td_deque_ntasks,
2655 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2659 tail = (thread_data->td.td_deque_tail - 1) &
2660 TASK_DEQUE_MASK(thread_data->td);
2661 taskdata = thread_data->td.td_deque[tail];
2663 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2664 thread->th.th_current_task)) {
2666 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2668 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2669 "ntasks=%d head=%u tail=%u\n",
2670 gtid, thread_data->td.td_deque_ntasks,
2671 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2675 thread_data->td.td_deque_tail = tail;
2676 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2678 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2680 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2681 "ntasks=%d head=%u tail=%u\n",
2682 gtid, taskdata, thread_data->td.td_deque_ntasks,
2683 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2685 task = KMP_TASKDATA_TO_TASK(taskdata);
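// Minimal standalone sketch (hypothetical names) of the tail-pop just above:
// the per-thread deque is a power-of-two ring buffer, so "tail - 1" wraps with
// a mask instead of a modulo, and the owner always takes its newest task (LIFO).
#if 0
#include <cstdint>

struct toy_deque {
  void **slots;        // ring buffer; size is a power of two
  uint32_t size;       // e.g. 256
  uint32_t head, tail; // head: oldest entry, tail: one past the newest
  uint32_t ntasks;     // number of queued tasks
};

static void *toy_pop_own_tail(toy_deque &d) {
  if (d.ntasks == 0)
    return nullptr;                 // nothing to run
  uint32_t mask = d.size - 1;       // valid only because size is 2^k
  uint32_t t = (d.tail - 1) & mask; // index of the newest task
  void *task = d.slots[t];
  d.tail = t;                       // shrink from the tail end
  --d.ntasks;
  return task;
}
#endif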
2692 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2693 kmp_task_team_t *task_team,
2694 std::atomic<kmp_int32> *unfinished_threads,
2695 int *thread_finished,
2696 kmp_int32 is_constrained) {
2698 kmp_taskdata_t *taskdata;
2699 kmp_taskdata_t *current;
2700 kmp_thread_data_t *victim_td, *threads_data;
2702 kmp_int32 victim_tid;
2704 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2706 threads_data = task_team->tt.tt_threads_data;
2707 KMP_DEBUG_ASSERT(threads_data != NULL);
2709 victim_tid = victim_thr->th.th_info.ds.ds_tid;
2710 victim_td = &threads_data[victim_tid];
2712 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2713 "task_team=%p ntasks=%d head=%u tail=%u\n",
2714 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2715 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2716 victim_td->td.td_deque_tail));
2718 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2719 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2720 "task_team=%p ntasks=%d head=%u tail=%u\n",
2721 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2722 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2723 victim_td->td.td_deque_tail));
2727 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2729 int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2732 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2733 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2734 "task_team=%p ntasks=%d head=%u tail=%u\n",
2735 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2736 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2740 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2741 current = __kmp_threads[gtid]->th.th_current_task;
2742 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2743 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2745 victim_td->td.td_deque_head =
2746 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2748 if (!task_team->tt.tt_untied_task_encountered) {
2750 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2751 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2752 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2753 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2754 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2759 target = victim_td->td.td_deque_head;
2761 for (i = 1; i < ntasks; ++i) {
2762 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2763 taskdata = victim_td->td.td_deque[target];
2764 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2770 if (taskdata == NULL) {
2772 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2773 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2774 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2775 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2776 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2780 for (i = i + 1; i < ntasks; ++i) {
2782 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2783 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2787 victim_td->td.td_deque_tail ==
2788 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2789 victim_td->td.td_deque_tail = target;
2791 if (*thread_finished) {
2797 count = KMP_ATOMIC_INC(unfinished_threads);
2801 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2802 gtid, count + 1, task_team));
2804 *thread_finished = FALSE;
2806 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2808 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2812 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2813 "task_team=%p ntasks=%d head=%u tail=%u\n",
2814 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2815 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2817 task = KMP_TASKDATA_TO_TASK(taskdata);
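// Sketch (hypothetical names) of the gap-closing step in __kmp_steal_task: when
// the thief has to take a task from the middle of the victim's ring (the head
// task was blocked by the task scheduling constraint), every later entry is
// shifted one slot toward the head so head..tail stays contiguous, and the
// caller then pulls tail back by one.
#if 0
#include <cstdint>

static void toy_close_gap(void **slots, uint32_t mask, uint32_t taken_slot,
                          uint32_t entries_behind) {
  uint32_t prev = taken_slot;
  for (uint32_t i = 0; i < entries_behind; ++i) {
    uint32_t next = (prev + 1) & mask; // wrap with the power-of-two mask
    slots[prev] = slots[next];         // slide the entry toward the head
    prev = next;
  }
}
#endif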
2831 static inline int __kmp_execute_tasks_template(
2832 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2833 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2834 kmp_int32 is_constrained) {
2835 kmp_task_team_t *task_team = thread->th.th_task_team;
2836 kmp_thread_data_t *threads_data;
2838 kmp_info_t *other_thread;
2839 kmp_taskdata_t *current_task = thread->th.th_current_task;
2840 std::atomic<kmp_int32> *unfinished_threads;
2841 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2842 tid = thread->th.th_info.ds.ds_tid;
2844 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2845 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2847 if (task_team == NULL || current_task == NULL)
2850 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2851 "*thread_finished=%d\n",
2852 gtid, final_spin, *thread_finished));
2854 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2855 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2856 KMP_DEBUG_ASSERT(threads_data != NULL);
2858 nthreads = task_team->tt.tt_nproc;
2859 unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2861 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2863 KMP_DEBUG_ASSERT(nthreads > 1);
2865 KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2871 if (use_own_tasks) {
2872 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2874 if ((task == NULL) && (nthreads > 1)) {
2878 if (victim_tid == -2) {
2879 victim_tid = threads_data[tid].td.td_deque_last_stolen;
2882 other_thread = threads_data[victim_tid].td.td_thr;
2884 if (victim_tid != -1) {
2886 } else if (!new_victim) {
2892 victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2893 if (victim_tid >= tid) {
2897 other_thread = threads_data[victim_tid].td.td_thr;
2907 if ((__kmp_tasking_mode == tskm_task_teams) &&
2908 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2909 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2912 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2913 other_thread->th.th_sleep_loc);
2926 task = __kmp_steal_task(other_thread, gtid, task_team,
2927 unfinished_threads, thread_finished,
2931 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2932 threads_data[tid].td.td_deque_last_stolen = victim_tid;
2939 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2948 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2949 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2950 if (itt_sync_obj == NULL) {
2952 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2954 __kmp_itt_task_starting(itt_sync_obj);
2957 __kmp_invoke_task(gtid, task, current_task);
2959 if (itt_sync_obj != NULL)
2960 __kmp_itt_task_finished(itt_sync_obj);
2967 if (flag == NULL || (!final_spin && flag->done_check())) {
2970 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2974 if (thread->th.th_task_team == NULL) {
2977 KMP_YIELD(__kmp_library == library_throughput);
2980 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2981 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2982 "other tasks, restart\n",
2995 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
3003 if (!*thread_finished) {
3006 count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
3007 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3008 "unfinished_threads to %d task_team=%p\n",
3009 gtid, count, task_team));
3010 *thread_finished = TRUE;
3018 if (flag != NULL && flag->done_check()) {
3021 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3029 if (thread->th.th_task_team == NULL) {
3031 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3044 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3050 int __kmp_execute_tasks_32(
3051 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
3052 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3053 kmp_int32 is_constrained) {
3054 return __kmp_execute_tasks_template(
3055 thread, gtid, flag, final_spin,
3056 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3059 int __kmp_execute_tasks_64(
3060 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
3061 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3062 kmp_int32 is_constrained) {
3063 return __kmp_execute_tasks_template(
3064 thread, gtid, flag, final_spin,
3065 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3068 int __kmp_execute_tasks_oncore(
3069 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3070 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3071 kmp_int32 is_constrained) {
3072 return __kmp_execute_tasks_template(
3073 thread, gtid, flag, final_spin,
3074 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
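// Sketch of the victim-selection trick used in __kmp_execute_tasks_template
// above (names here are hypothetical): draw from nthreads - 1 candidates and
// bump the result past the caller's own tid, so a thread never picks itself
// while every other thread remains equally likely.
#if 0
static int toy_pick_victim(unsigned random_value, int self_tid, int nthreads) {
  int victim = (int)(random_value % (unsigned)(nthreads - 1));
  if (victim >= self_tid)
    ++victim;    // skip over ourselves
  return victim; // uniform over the other nthreads - 1 threads
}
#endif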
3080 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3081 kmp_info_t *this_thr) {
3082 kmp_thread_data_t *threads_data;
3083 int nthreads, i, is_init_thread;
3085 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3086 __kmp_gtid_from_thread(this_thr)));
3088 KMP_DEBUG_ASSERT(task_team != NULL);
3089 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3091 nthreads = task_team->tt.tt_nproc;
3092 KMP_DEBUG_ASSERT(nthreads > 0);
3093 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3096 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3098 if (!is_init_thread) {
3102 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3103 __kmp_gtid_from_thread(this_thr)));
3106 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3107 KMP_DEBUG_ASSERT(threads_data != NULL);
3109 if (__kmp_tasking_mode == tskm_task_teams &&
3110 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3114 for (i = 0; i < nthreads; i++) {
3115 volatile void *sleep_loc;
3116 kmp_info_t *thread = threads_data[i].td.td_thr;
3118 if (i == this_thr->th.th_info.ds.ds_tid) {
3127 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3129 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3130 __kmp_gtid_from_thread(this_thr),
3131 __kmp_gtid_from_thread(thread)));
3132 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3134 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3135 __kmp_gtid_from_thread(this_thr),
3136 __kmp_gtid_from_thread(thread)));
3141 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3142 __kmp_gtid_from_thread(this_thr)));
3179 static kmp_task_team_t *__kmp_free_task_teams =
3182 kmp_bootstrap_lock_t __kmp_task_team_lock =
3183 KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3190 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3191 kmp_thread_data_t *thread_data) {
3192 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3193 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3196 thread_data->td.td_deque_last_stolen = -1;
3198 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3199 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3200 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3204 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3205 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3209 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3210 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3211 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3217 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3218 if (thread_data->td.td_deque != NULL) {
3219 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3220 TCW_4(thread_data->td.td_deque_ntasks, 0);
3221 __kmp_free(thread_data->td.td_deque);
3222 thread_data->td.td_deque = NULL;
3223 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3226 #ifdef BUILD_TIED_TASK_STACK
3228 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3229 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3231 #endif // BUILD_TIED_TASK_STACK
3241 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3242 kmp_task_team_t *task_team) {
3243 kmp_thread_data_t **threads_data_p;
3244 kmp_int32 nthreads, maxthreads;
3245 int is_init_thread = FALSE;
3247 if (TCR_4(task_team->tt.tt_found_tasks)) {
3252 threads_data_p = &task_team->tt.tt_threads_data;
3253 nthreads = task_team->tt.tt_nproc;
3254 maxthreads = task_team->tt.tt_max_threads;
3259 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3261 if (!TCR_4(task_team->tt.tt_found_tasks)) {
3263 kmp_team_t *team = thread->th.th_team;
3266 is_init_thread = TRUE;
3267 if (maxthreads < nthreads) {
3269 if (*threads_data_p != NULL) {
3270 kmp_thread_data_t *old_data = *threads_data_p;
3271 kmp_thread_data_t *new_data = NULL;
3275 ("__kmp_realloc_task_threads_data: T#%d reallocating "
3276 "threads data for task_team %p, new_size = %d, old_size = %d\n",
3277 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3282 new_data = (kmp_thread_data_t *)__kmp_allocate(
3283 nthreads * sizeof(kmp_thread_data_t));
3285 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3286 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3288 #ifdef BUILD_TIED_TASK_STACK
3290 for (i = maxthreads; i < nthreads; i++) {
3291 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3292 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3294 #endif // BUILD_TIED_TASK_STACK
3296 (*threads_data_p) = new_data;
3297 __kmp_free(old_data);
3299 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3300 "threads data for task_team %p, size = %d\n",
3301 __kmp_gtid_from_thread(thread), task_team, nthreads));
3305 ANNOTATE_IGNORE_WRITES_BEGIN();
3306 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3307 nthreads * sizeof(kmp_thread_data_t));
3308 ANNOTATE_IGNORE_WRITES_END();
3309 #ifdef BUILD_TIED_TASK_STACK
3311 for (i = 0; i < nthreads; i++) {
3312 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3313 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3315 #endif // BUILD_TIED_TASK_STACK
3317 task_team->tt.tt_max_threads = nthreads;
3320 KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3324 for (i = 0; i < nthreads; i++) {
3325 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3326 thread_data->td.td_thr = team->t.t_threads[i];
3328 if (thread_data->td.td_deque_last_stolen >= nthreads) {
3332 thread_data->td.td_deque_last_stolen = -1;
3337 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3340 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3341 return is_init_thread;
3347 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3348 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3349 if (task_team->tt.tt_threads_data != NULL) {
3351 for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3352 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3354 __kmp_free(task_team->tt.tt_threads_data);
3355 task_team->tt.tt_threads_data = NULL;
3357 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3364 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3366 kmp_task_team_t *task_team = NULL;
3369 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3370 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3372 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3374 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3375 if (__kmp_free_task_teams != NULL) {
3376 task_team = __kmp_free_task_teams;
3377 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3378 task_team->tt.tt_next = NULL;
3380 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3383 if (task_team == NULL) {
3384 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3385 "task team for team %p\n",
3386 __kmp_gtid_from_thread(thread), team));
3390 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3391 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3398 TCW_4(task_team->tt.tt_found_tasks, FALSE);
3400 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3402 task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3404 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3405 TCW_4(task_team->tt.tt_active, TRUE);
3407 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3408 "unfinished_threads init'd to %d\n",
3409 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3410 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3417 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3418 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3419 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3422 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3424 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3425 task_team->tt.tt_next = __kmp_free_task_teams;
3426 TCW_PTR(__kmp_free_task_teams, task_team);
3428 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3436 void __kmp_reap_task_teams(void) {
3437 kmp_task_team_t *task_team;
3439 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3441 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3442 while ((task_team = __kmp_free_task_teams) != NULL) {
3443 __kmp_free_task_teams = task_team->tt.tt_next;
3444 task_team->tt.tt_next = NULL;
3447 if (task_team->tt.tt_threads_data != NULL) {
3448 __kmp_free_task_threads_data(task_team);
3450 __kmp_free(task_team);
3452 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3459 void __kmp_wait_to_unref_task_teams(void) {
3464 KMP_INIT_YIELD(spins);
3472 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3473 thread = thread->th.th_next_pool) {
3477 if (TCR_PTR(thread->th.th_task_team) == NULL) {
3478 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3479 __kmp_gtid_from_thread(thread)));
3484 if (!__kmp_is_thread_alive(thread, &exit_val)) {
3485 thread->th.th_task_team = NULL;
3492 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3493 "unreference task_team\n",
3494 __kmp_gtid_from_thread(thread)));
3496 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3497 volatile void *sleep_loc;
3499 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3503 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3504 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3505 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3514 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3520 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3521 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3527 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3528 (always || team->t.t_nproc > 1)) {
3529 team->t.t_task_team[this_thr->th.th_task_state] =
3530 __kmp_allocate_task_team(this_thr, team);
3531 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3532 "for team %d at parity=%d\n",
3533 __kmp_gtid_from_thread(this_thr),
3534 team->t.t_task_team[this_thr->th.th_task_state],
3535 ((team != NULL) ? team->t.t_id : -1),
3536 this_thr->th.th_task_state));
3546 if (team->t.t_nproc > 1) {
3547 int other_team = 1 - this_thr->th.th_task_state;
3548 if (team->t.t_task_team[other_team] == NULL) {
3549 team->t.t_task_team[other_team] =
3550 __kmp_allocate_task_team(this_thr, team);
3551 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3552 "task_team %p for team %d at parity=%d\n",
3553 __kmp_gtid_from_thread(this_thr),
3554 team->t.t_task_team[other_team],
3555 ((team != NULL) ? team->t.t_id : -1), other_team));
3558 kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3559 if (!task_team->tt.tt_active ||
3560 team->t.t_nproc != task_team->tt.tt_nproc) {
3561 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3562 TCW_4(task_team->tt.tt_found_tasks, FALSE);
3564 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3566 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3568 TCW_4(task_team->tt.tt_active, TRUE);
3572 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3573 "%p for team %d at parity=%d\n",
3574 __kmp_gtid_from_thread(this_thr),
3575 team->t.t_task_team[other_team],
3576 ((team != NULL) ? team->t.t_id : -1), other_team));
3584 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3585 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3589 this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3592 TCW_PTR(this_thr->th.th_task_team,
3593 team->t.t_task_team[this_thr->th.th_task_state]);
3595 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3596 "%p from Team #%d (parity=%d)\n",
3597 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3598 ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
3608 void __kmp_task_team_wait(
3609 kmp_info_t *this_thr,
3610 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3611 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3613 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3614 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3616 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3618 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3619 "(for unfinished_threads to reach 0) on task_team = %p\n",
3620 __kmp_gtid_from_thread(this_thr), task_team));
3624 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3625 &task_team->tt.tt_unfinished_threads),
3627 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3633 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3634 "setting active to false, setting local and team's pointer to NULL\n",
3635 __kmp_gtid_from_thread(this_thr), task_team));
3637 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3638 task_team->tt.tt_found_proxy_tasks == TRUE);
3639 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3641 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
3643 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3644 TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3647 TCW_PTR(this_thr->th.th_task_team, NULL);
3656 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3657 std::atomic<kmp_uint32> *spin = RCAST(
3658 std::atomic<kmp_uint32> *,
3659 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3661 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3664 KMP_FSYNC_SPIN_INIT(spin, NULL);
3666 kmp_flag_32 spin_flag(spin, 0U);
3667 while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3668 &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3671 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3674 if (TCR_4(__kmp_global.g.g_done)) {
3675 if (__kmp_global.g.g_abort)
3676 __kmp_abort_thread();
3682 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3693 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3695 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3696 kmp_task_team_t *task_team = taskdata->td_task_team;
3698 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3702 KMP_DEBUG_ASSERT(task_team != NULL);
3704 bool result = false;
3705 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3707 if (thread_data->td.td_deque == NULL) {
3711 ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3716 if (TCR_4(thread_data->td.td_deque_ntasks) >=
3717 TASK_DEQUE_SIZE(thread_data->td)) {
3720 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3725 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3728 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3729 __kmp_realloc_task_deque(thread, thread_data);
3733 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3735 if (TCR_4(thread_data->td.td_deque_ntasks) >=
3736 TASK_DEQUE_SIZE(thread_data->td)) {
3737 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3743 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3744 goto release_and_exit;
3746 __kmp_realloc_task_deque(thread, thread_data);
3752 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3754 thread_data->td.td_deque_tail =
3755 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3756 TCW_4(thread_data->td.td_deque_ntasks,
3757 TCR_4(thread_data->td.td_deque_ntasks) + 1);
3760 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3764 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
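// Sketch of the growth policy in __kmp_give_task above (hypothetical names): a
// full deque may only be enlarged while its current size is still below
// pass * initial_size, so each retry "pass" of the caller tolerates a larger
// deque before it gives up on this thread.
#if 0
#include <cstdint>

static bool toy_may_grow_deque(uint32_t current_size, uint32_t initial_size,
                               uint32_t pass) {
  // mirrors: TASK_DEQUE_SIZE(td) / INITIAL_TASK_DEQUE_SIZE >= pass -> refuse
  return (current_size / initial_size) < pass;
}
#endif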
3785 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3786 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3787 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3788 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3789 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3791 taskdata->td_flags.complete = 1;
3793 if (taskdata->td_taskgroup)
3794 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3798 KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3801 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3802 kmp_int32 children = 0;
3806 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3807 KMP_DEBUG_ASSERT(children >= 0);
3810 KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3813 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3814 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3815 kmp_info_t *thread = __kmp_threads[gtid];
3817 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3818 KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3823 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3826 __kmp_release_deps(gtid, taskdata);
3827 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3838 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3839 KMP_DEBUG_ASSERT(ptask != NULL);
3840 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3842 10, (
"__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3845 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3847 __kmp_first_top_half_finish_proxy(taskdata);
3848 __kmp_second_top_half_finish_proxy(taskdata);
3849 __kmp_bottom_half_finish_proxy(gtid, ptask);
3852 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3863 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3864 KMP_DEBUG_ASSERT(ptask != NULL);
3865 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3869 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3872 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3874 __kmp_first_top_half_finish_proxy(taskdata);
3878 kmp_team_t *team = taskdata->td_team;
3879 kmp_int32 nthreads = team->t.t_nproc;
3884 kmp_int32 start_k = 0;
3886 kmp_int32 k = start_k;
3890 thread = team->t.t_threads[k];
3891 k = (k + 1) % nthreads;
3897 }
while (!__kmp_give_task(thread, k, ptask, pass));
3899 __kmp_second_top_half_finish_proxy(taskdata);
3903 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3913 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3915 kmp_taskdata_t *taskdata;
3916 kmp_taskdata_t *taskdata_src;
3917 kmp_taskdata_t *parent_task = thread->th.th_current_task;
3918 size_t shareds_offset;
3921 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3923 taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3924 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3926 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3927 task_size = taskdata_src->td_size_alloc;
3930 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3933 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3935 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3937 KMP_MEMCPY(taskdata, taskdata_src, task_size);
3939 task = KMP_TASKDATA_TO_TASK(taskdata);
3942 taskdata->td_task_id = KMP_GEN_TASK_ID();
3943 if (task->shareds != NULL) {
3944 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3945 task->shareds = &((char *)taskdata)[shareds_offset];
3946 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3949 taskdata->td_alloc_thread = thread;
3950 taskdata->td_parent = parent_task;
3951 taskdata->td_taskgroup =
3957 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3958 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
3959 if (parent_task->td_taskgroup)
3960 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
3963 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3964 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
3968 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3969 thread, taskdata, taskdata->td_parent));
3971 if (UNLIKELY(ompt_enabled.enabled))
3972 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
3981 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
3983 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
3988 class kmp_taskloop_bounds_t {
3990 const kmp_taskdata_t *taskdata;
3991 size_t lower_offset;
3992 size_t upper_offset;
3995 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
3996 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
3997 lower_offset((char *)lb - (char *)task),
3998 upper_offset((char *)ub - (char *)task) {
3999 KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4000 KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4002 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4003 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4004 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4005 size_t get_lower_offset() const { return lower_offset; }
4006 size_t get_upper_offset() const { return upper_offset; }
4007 kmp_uint64 get_lb() const {
4009 #if defined(KMP_GOMP_COMPAT)
4011 if (!taskdata->td_flags.native) {
4012 retval = *(kmp_int64 *)((char *)task + lower_offset);
4015 if (taskdata->td_size_loop_bounds == 4) {
4016 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4017 retval = (kmp_int64)*lb;
4019 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4020 retval = (kmp_int64)*lb;
4024 retval = *(kmp_int64 *)((char *)task + lower_offset);
4025 #endif // defined(KMP_GOMP_COMPAT)
4028 kmp_uint64 get_ub() const {
4030 #if defined(KMP_GOMP_COMPAT)
4032 if (!taskdata->td_flags.native) {
4033 retval = *(kmp_int64 *)((char *)task + upper_offset);
4036 if (taskdata->td_size_loop_bounds == 4) {
4037 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4038 retval = (kmp_int64)*ub;
4040 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4041 retval = (kmp_int64)*ub;
4045 retval = *(kmp_int64 *)((char *)task + upper_offset);
4046 #endif // defined(KMP_GOMP_COMPAT)
4049 void set_lb(kmp_uint64 lb) {
4050 #if defined(KMP_GOMP_COMPAT)
4052 if (!taskdata->td_flags.native) {
4053 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4056 if (taskdata->td_size_loop_bounds == 4) {
4057 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4058 *lower = (kmp_uint32)lb;
4060 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4061 *lower = (kmp_uint64)lb;
4065 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4066 #endif // defined(KMP_GOMP_COMPAT)
4068 void set_ub(kmp_uint64 ub) {
4069 #if defined(KMP_GOMP_COMPAT)
4071 if (!taskdata->td_flags.native) {
4072 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4075 if (taskdata->td_size_loop_bounds == 4) {
4076 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4077 *upper = (kmp_uint32)ub;
4079 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4080 *upper = (kmp_uint64)ub;
4084 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4085 #endif // defined(KMP_GOMP_COMPAT)
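// Sketch of the byte-offset trick kmp_taskloop_bounds_t relies on (hypothetical
// names): the loop bounds live inside the task's private storage, so any copy
// of the task can reach its own lb/ub by reusing the same offsets from the
// task base pointer.
#if 0
#include <cstddef>
#include <cstdint>
#include <cstring>

struct toy_task { char storage[64]; }; // stands in for kmp_task_t + privates

static uint64_t toy_read_bound(const toy_task *t, size_t offset) {
  uint64_t value;
  std::memcpy(&value, (const char *)t + offset, sizeof(value));
  return value;
}

static void toy_write_bound(toy_task *t, size_t offset, uint64_t value) {
  std::memcpy((char *)t + offset, &value, sizeof(value));
}
#endif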
4104 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4105 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4106 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4107 kmp_uint64 grainsize, kmp_uint64 extras,
4114 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4115 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4117 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4118 kmp_uint64 lower = task_bounds.get_lb();
4119 kmp_uint64 upper = task_bounds.get_ub();
4121 kmp_info_t *thread = __kmp_threads[gtid];
4122 kmp_taskdata_t *current_task = thread->th.th_current_task;
4123 kmp_task_t *next_task;
4124 kmp_int32 lastpriv = 0;
4126 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4127 KMP_DEBUG_ASSERT(num_tasks > extras);
4128 KMP_DEBUG_ASSERT(num_tasks > 0);
4129 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4130 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4131 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
4135 for (i = 0; i < num_tasks; ++i) {
4136 kmp_uint64 chunk_minus_1;
4138 chunk_minus_1 = grainsize - 1;
4140 chunk_minus_1 = grainsize;
4143 upper = lower + st * chunk_minus_1;
4144 if (i == num_tasks - 1) {
4147 KMP_DEBUG_ASSERT(upper == *ub);
4148 if (upper == ub_glob)
4150 }
else if (st > 0) {
4151 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4152 if ((kmp_uint64)st > ub_glob - upper)
4155 KMP_DEBUG_ASSERT(upper + st < *ub);
4156 if (upper - ub_glob < (kmp_uint64)(-st))
4160 next_task = __kmp_task_dup_alloc(thread, task);
4161 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4162 kmp_taskloop_bounds_t next_task_bounds =
4163 kmp_taskloop_bounds_t(next_task, task_bounds);
4166 next_task_bounds.set_lb(lower);
4167 if (next_taskdata->td_flags.native) {
4168 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4170 next_task_bounds.set_ub(upper);
4172 if (ptask_dup != NULL)
4173 ptask_dup(next_task, task, lastpriv);
4175 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4176 "upper %lld stride %lld, (offsets %p %p)\n",
4177 gtid, i, next_task, lower, upper, st,
4178 next_task_bounds.get_lower_offset(),
4179 next_task_bounds.get_upper_offset()));
4181 __kmp_omp_taskloop_task(NULL, gtid, next_task,
4184 __kmp_omp_task(gtid, next_task, true);
4189 __kmp_task_start(gtid, task, current_task);
4191 __kmp_task_finish<false>(gtid, task, current_task);
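// Arithmetic sketch for the chunking in __kmp_taskloop_linear (hypothetical
// names): with tc == num_tasks * grainsize + extras, giving the first 'extras'
// chunks one additional iteration makes the chunk sizes sum back to tc exactly.
#if 0
#include <cassert>
#include <cstdint>

static void toy_check_chunks(uint64_t tc, uint64_t num_tasks,
                             uint64_t grainsize, uint64_t extras) {
  assert(tc == num_tasks * grainsize + extras && extras < num_tasks);
  uint64_t covered = 0;
  for (uint64_t i = 0; i < num_tasks; ++i)
    covered += (i < extras) ? grainsize + 1 : grainsize;
  assert(covered == tc); // every iteration lands in exactly one chunk
}
#endif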
4196 typedef struct __taskloop_params {
4203 kmp_uint64 num_tasks;
4204 kmp_uint64 grainsize;
4207 kmp_uint64 num_t_min;
4211 } __taskloop_params_t;
4213 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4214 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4215 kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
4222 int __kmp_taskloop_task(int gtid, void *ptask) {
4223 __taskloop_params_t *p =
4224 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4225 kmp_task_t *task = p->task;
4226 kmp_uint64 *lb = p->lb;
4227 kmp_uint64 *ub = p->ub;
4228 void *task_dup = p->task_dup;
4230 kmp_int64 st = p->st;
4231 kmp_uint64 ub_glob = p->ub_glob;
4232 kmp_uint64 num_tasks = p->num_tasks;
4233 kmp_uint64 grainsize = p->grainsize;
4234 kmp_uint64 extras = p->extras;
4235 kmp_uint64 tc = p->tc;
4236 kmp_uint64 num_t_min = p->num_t_min;
4238 void *codeptr_ra = p->codeptr_ra;
4241 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4242 KMP_DEBUG_ASSERT(task != NULL);
4243 KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4244 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4245 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4248 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4249 if (num_tasks > num_t_min)
4250 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4251 grainsize, extras, tc, num_t_min,
4257 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4258 grainsize, extras, tc,
4264 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4285 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4286 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4287 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4288 kmp_uint64 grainsize, kmp_uint64 extras,
4289 kmp_uint64 tc, kmp_uint64 num_t_min,
4295 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4296 KMP_DEBUG_ASSERT(task != NULL);
4297 KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4298 KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4299 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4300 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4303 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4304 kmp_uint64 lower = *lb;
4305 kmp_info_t *thread = __kmp_threads[gtid];
4307 kmp_task_t *next_task;
4308 size_t lower_offset =
4309 (char *)lb - (char *)task;
4310 size_t upper_offset =
4311 (char *)ub - (char *)task;
4313 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4314 KMP_DEBUG_ASSERT(num_tasks > extras);
4315 KMP_DEBUG_ASSERT(num_tasks > 0);
4318 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4319 kmp_uint64 gr_size0 = grainsize;
4320 kmp_uint64 n_tsk0 = num_tasks >> 1;
4321 kmp_uint64 n_tsk1 = num_tasks - n_tsk0;
4322 if (n_tsk0 <= extras) {
4325 ext1 = extras - n_tsk0;
4326 tc0 = gr_size0 * n_tsk0;
4331 tc1 = grainsize * n_tsk1;
4334 ub0 = lower + st * (tc0 - 1);
4338 next_task = __kmp_task_dup_alloc(thread, task);
4340 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4341 if (ptask_dup != NULL)
4342 ptask_dup(next_task, task, 0);
4346 kmp_task_t *new_task =
4347 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4348 sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4349 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4350 p->task = next_task;
4351 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4352 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4353 p->task_dup = task_dup;
4355 p->ub_glob = ub_glob;
4356 p->num_tasks = n_tsk1;
4357 p->grainsize = grainsize;
4360 p->num_t_min = num_t_min;
4362 p->codeptr_ra = codeptr_ra;
4367 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4369 __kmp_omp_task(gtid, new_task, true);
4373 if (n_tsk0 > num_t_min)
4374 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4375 ext0, tc0, num_t_min,
4381 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4382 gr_size0, ext0, tc0,
4388 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
4407 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4408 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4409 int sched, kmp_uint64 grainsize, void *task_dup) {
4410 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4411 KMP_DEBUG_ASSERT(task != NULL);
4414 #if OMPT_SUPPORT && OMPT_OPTIONAL
4415 OMPT_STORE_RETURN_ADDRESS(gtid);
4417 __kmpc_taskgroup(loc, gtid);
4422 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4425 kmp_uint64 lower = task_bounds.get_lb();
4426 kmp_uint64 upper = task_bounds.get_ub();
4427 kmp_uint64 ub_glob = upper;
4428 kmp_uint64 num_tasks = 0, extras = 0;
4429 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4430 kmp_info_t *thread = __kmp_threads[gtid];
4431 kmp_taskdata_t *current_task = thread->th.th_current_task;
4433 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4434 "grain %llu(%d), dup %p\n",
4435 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4439 tc = upper - lower + 1;
4440 } else if (st < 0) {
4441 tc = (lower - upper) / (-st) + 1;
4443 tc = (upper - lower) / st + 1;
4446 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4448 __kmp_task_start(gtid, task, current_task);
4450 __kmp_task_finish<false>(gtid, task, current_task);
4454 #if OMPT_SUPPORT && OMPT_OPTIONAL
4455 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4456 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4457 if (ompt_enabled.ompt_callback_work) {
4458 ompt_callbacks.ompt_callback(ompt_callback_work)(
4459 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4460 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4464 if (num_tasks_min == 0)
4467 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4473 grainsize = thread->th.th_team_nproc * 10;
4476 if (grainsize > tc) {
4481 num_tasks = grainsize;
4482 grainsize = tc / num_tasks;
4483 extras = tc % num_tasks;
4487 if (grainsize > tc) {
4492 num_tasks = tc / grainsize;
4494 grainsize = tc / num_tasks;
4495 extras = tc % num_tasks;
4499 KMP_ASSERT2(0, "unknown scheduling of taskloop");
4501 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4502 KMP_DEBUG_ASSERT(num_tasks > extras);
4503 KMP_DEBUG_ASSERT(num_tasks > 0);
4509 taskdata->td_flags.task_serial = 1;
4510 taskdata->td_flags.tiedness = TASK_TIED;
4512 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4513 grainsize, extras, tc,
4515 OMPT_GET_RETURN_ADDRESS(0),
4520 }
else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4521 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4522 "(%lld), grain %llu, extras %llu\n",
4523 gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4524 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4525 grainsize, extras, tc, num_tasks_min,
4527 OMPT_GET_RETURN_ADDRESS(0),
4531 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4532 "(%lld), grain %llu, extras %llu\n",
4533 gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4534 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4535 grainsize, extras, tc,
4537 OMPT_GET_RETURN_ADDRESS(0),
4542 #if OMPT_SUPPORT && OMPT_OPTIONAL
4543 if (ompt_enabled.ompt_callback_work) {
4544 ompt_callbacks.ompt_callback(ompt_callback_work)(
4545 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4546 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4551 #if OMPT_SUPPORT && OMPT_OPTIONAL
4552 OMPT_STORE_RETURN_ADDRESS(gtid);
4554 __kmpc_end_taskgroup(loc, gtid);
4556 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));