LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_stats.h"
20 #include "kmp_wait_release.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #include "tsan_annotations.h"
27 
28 /* forward declarations */
29 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
30  kmp_info_t *this_thr);
31 static void __kmp_alloc_task_deque(kmp_info_t *thread,
32  kmp_thread_data_t *thread_data);
33 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
34  kmp_task_team_t *task_team);
35 
36 #ifdef OMP_45_ENABLED
37 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
38 #endif
39 
40 #ifdef BUILD_TIED_TASK_STACK
41 
42 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
43 // from top to bottom
44 //
45 // gtid: global thread identifier for thread containing stack
46 // thread_data: thread data for task team thread containing stack
47 // threshold: value above which the trace statement triggers
48 // location: string identifying call site of this function (for trace)
49 static void __kmp_trace_task_stack(kmp_int32 gtid,
50  kmp_thread_data_t *thread_data,
51  int threshold, char *location) {
52  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
53  kmp_taskdata_t **stack_top = task_stack->ts_top;
54  kmp_int32 entries = task_stack->ts_entries;
55  kmp_taskdata_t *tied_task;
56 
57  KA_TRACE(
58  threshold,
59  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
60  "first_block = %p, stack_top = %p \n",
61  location, gtid, entries, task_stack->ts_first_block, stack_top));
62 
63  KMP_DEBUG_ASSERT(stack_top != NULL);
64  KMP_DEBUG_ASSERT(entries > 0);
65 
66  while (entries != 0) {
67  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
68  // fix up ts_top if we need to pop from previous block
69  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
70  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
71 
72  stack_block = stack_block->sb_prev;
73  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
74  }
75 
76  // finish bookkeeping
77  stack_top--;
78  entries--;
79 
80  tied_task = *stack_top;
81 
82  KMP_DEBUG_ASSERT(tied_task != NULL);
83  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
84 
85  KA_TRACE(threshold,
86  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
87  "stack_top=%p, tied_task=%p\n",
88  location, gtid, entries, stack_top, tied_task));
89  }
90  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
91 
92  KA_TRACE(threshold,
93  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
94  location, gtid));
95 }
96 
97 // __kmp_init_task_stack: initialize the task stack for the first time
98 // after a thread_data structure is created.
99 // It should not be necessary to do this again (assuming the stack works).
100 //
101 // gtid: global thread identifier of calling thread
102 // thread_data: thread data for task team thread containing stack
103 static void __kmp_init_task_stack(kmp_int32 gtid,
104  kmp_thread_data_t *thread_data) {
105  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
106  kmp_stack_block_t *first_block;
107 
108  // set up the first block of the stack
109  first_block = &task_stack->ts_first_block;
110  task_stack->ts_top = (kmp_taskdata_t **)first_block;
111  memset((void *)first_block, '\0',
112  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
113 
114  // initialize the stack to be empty
115  task_stack->ts_entries = TASK_STACK_EMPTY;
116  first_block->sb_next = NULL;
117  first_block->sb_prev = NULL;
118 }
119 
120 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
121 //
122 // gtid: global thread identifier for calling thread
123 // thread_data: thread info for thread containing stack
124 static void __kmp_free_task_stack(kmp_int32 gtid,
125  kmp_thread_data_t *thread_data) {
126  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
127  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
128 
129  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
130  // free from the second block of the stack
131  while (stack_block != NULL) {
132  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
133 
134  stack_block->sb_next = NULL;
135  stack_block->sb_prev = NULL;
136  if (stack_block != &task_stack->ts_first_block) {
137  __kmp_thread_free(__kmp_threads[gtid],
138  stack_block); // free the block, if not the first
139  }
140  stack_block = next_block;
141  }
142  // initialize the stack to be empty
143  task_stack->ts_entries = 0;
144  task_stack->ts_top = NULL;
145 }
146 
147 // __kmp_push_task_stack: Push the tied task onto the task stack.
148 // Grow the stack if necessary by allocating another block.
149 //
150 // gtid: global thread identifier for calling thread
151 // thread: thread info for thread containing stack
152 // tied_task: the task to push on the stack
153 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
154  kmp_taskdata_t *tied_task) {
155  // GEH - need to consider what to do if tt_threads_data not allocated yet
156  kmp_thread_data_t *thread_data =
157  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
158  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
159 
160  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
161  return; // Don't push anything on stack if team or team tasks are serialized
162  }
163 
164  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
165  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
166 
167  KA_TRACE(20,
168  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
169  gtid, thread, tied_task));
170  // Store entry
171  *(task_stack->ts_top) = tied_task;
172 
173  // Do bookkeeping for next push
174  task_stack->ts_top++;
175  task_stack->ts_entries++;
176 
177  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
178  // Find beginning of this task block
179  kmp_stack_block_t *stack_block =
180  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
181 
182  // Check if we already have a block
183  if (stack_block->sb_next !=
184  NULL) { // reset ts_top to beginning of next block
185  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
186  } else { // Alloc new block and link it up
187  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
188  thread, sizeof(kmp_stack_block_t));
189 
190  task_stack->ts_top = &new_block->sb_block[0];
191  stack_block->sb_next = new_block;
192  new_block->sb_prev = stack_block;
193  new_block->sb_next = NULL;
194 
195  KA_TRACE(
196  30,
197  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
198  gtid, tied_task, new_block));
199  }
200  }
201  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
202  tied_task));
203 }
204 
205 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
206 // the task, just check to make sure it matches the ending task passed in.
207 //
208 // gtid: global thread identifier for the calling thread
209 // thread: thread info structure containing stack
210 // tied_task: the task popped off the stack
211 // ending_task: the task that is ending (should match popped task)
212 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
213  kmp_taskdata_t *ending_task) {
214  // GEH - need to consider what to do if tt_threads_data not allocated yet
215  kmp_thread_data_t *thread_data =
216  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
217  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
218  kmp_taskdata_t *tied_task;
219 
220  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
221  // Don't pop anything from stack if team or team tasks are serialized
222  return;
223  }
224 
225  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
226  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
227 
228  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
229  thread));
230 
231  // fix up ts_top if we need to pop from previous block
232  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
233  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
234 
235  stack_block = stack_block->sb_prev;
236  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
237  }
238 
239  // finish bookkeeping
240  task_stack->ts_top--;
241  task_stack->ts_entries--;
242 
243  tied_task = *(task_stack->ts_top);
244 
245  KMP_DEBUG_ASSERT(tied_task != NULL);
246  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
247  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
248 
249  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
250  tied_task));
251  return;
252 }
253 #endif /* BUILD_TIED_TASK_STACK */
254 
255 // __kmp_push_task: Add a task to the thread's deque
256 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
257  kmp_info_t *thread = __kmp_threads[gtid];
258  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
259  kmp_task_team_t *task_team = thread->th.th_task_team;
260  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
261  kmp_thread_data_t *thread_data;
262 
263  KA_TRACE(20,
264  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
265 
266  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
267  // untied task needs to increment counter so that the task structure is not
268  // freed prematurely
269  kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
270  KA_TRACE(
271  20,
272  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
273  gtid, counter, taskdata));
274  }
275 
276  // The first check avoids building task_team thread data if serialized
277  if (taskdata->td_flags.task_serial) {
278  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
279  "TASK_NOT_PUSHED for task %p\n",
280  gtid, taskdata));
281  return TASK_NOT_PUSHED;
282  }
283 
284  // Now that serialized tasks have returned, we can assume that we are not in
285  // immediate exec mode
286  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
287  if (!KMP_TASKING_ENABLED(task_team)) {
288  __kmp_enable_tasking(task_team, thread);
289  }
290  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
291  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
292 
293  // Find tasking deque specific to encountering thread
294  thread_data = &task_team->tt.tt_threads_data[tid];
295 
296  // No lock needed since only owner can allocate
297  if (thread_data->td.td_deque == NULL) {
298  __kmp_alloc_task_deque(thread, thread_data);
299  }
300 
301  // Check if deque is full
302  if (TCR_4(thread_data->td.td_deque_ntasks) >=
303  TASK_DEQUE_SIZE(thread_data->td)) {
304  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
305  "TASK_NOT_PUSHED for task %p\n",
306  gtid, taskdata));
307  return TASK_NOT_PUSHED;
308  }
309 
310  // Lock the deque for the task push operation
311  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
312 
313 #if OMP_45_ENABLED
314  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
315  if (TCR_4(thread_data->td.td_deque_ntasks) >=
316  TASK_DEQUE_SIZE(thread_data->td)) {
317  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
318  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning "
319  "TASK_NOT_PUSHED for task %p\n",
320  gtid, taskdata));
321  return TASK_NOT_PUSHED;
322  }
323 #else
324  // Must have room since no thread can add tasks but calling thread
325  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
326  TASK_DEQUE_SIZE(thread_data->td));
327 #endif
328 
329  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
330  taskdata; // Push taskdata
331  // Wrap index.
332  thread_data->td.td_deque_tail =
333  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
334  TCW_4(thread_data->td.td_deque_ntasks,
335  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
336 
337  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
338  "task=%p ntasks=%d head=%u tail=%u\n",
339  gtid, taskdata, thread_data->td.td_deque_ntasks,
340  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
341 
342  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
343 
344  return TASK_SUCCESSFULLY_PUSHED;
345 }
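// Editor's note: the deque manipulated by __kmp_push_task above is a
// power-of-two circular buffer, so the tail index wraps with a bit mask
// rather than a modulo. A minimal stand-alone sketch of that arithmetic
// (hypothetical toy names, not part of the runtime) is kept here for
// reference, compiled out:
#if 0
enum { TOY_DEQUE_SIZE = 256 };                  // must be a power of two
enum { TOY_DEQUE_MASK = TOY_DEQUE_SIZE - 1 };
typedef struct toy_deque {
  void *slots[TOY_DEQUE_SIZE];
  unsigned head, tail, ntasks;
} toy_deque_t;

// Returns 1 if the task was queued, 0 if the deque is full and the caller
// should execute the task immediately (mirrors TASK_NOT_PUSHED above).
static int toy_push(toy_deque_t *d, void *task) {
  if (d->ntasks >= TOY_DEQUE_SIZE)
    return 0;
  d->slots[d->tail] = task;
  d->tail = (d->tail + 1) & TOY_DEQUE_MASK; // wrap index, as in __kmp_push_task
  d->ntasks++;
  return 1;
}
#endif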
346 
347 // __kmp_pop_current_task_from_thread: set up current task from called thread
348 // when team ends
349 //
350 // this_thr: thread structure to set current_task in.
351 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
352  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
353  "this_thread=%p, curtask=%p, "
354  "curtask_parent=%p\n",
355  0, this_thr, this_thr->th.th_current_task,
356  this_thr->th.th_current_task->td_parent));
357 
358  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
359 
360  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
361  "this_thread=%p, curtask=%p, "
362  "curtask_parent=%p\n",
363  0, this_thr, this_thr->th.th_current_task,
364  this_thr->th.th_current_task->td_parent));
365 }
366 
367 // __kmp_push_current_task_to_thread: set up current task in called thread for a
368 // new team
369 //
370 // this_thr: thread structure to set up
371 // team: team for implicit task data
372 // tid: thread within team to set up
373 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
374  int tid) {
375  // the current task of the thread is the parent of the newly created
376  // implicit tasks of the new team
377  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
378  "curtask=%p "
379  "parent_task=%p\n",
380  tid, this_thr, this_thr->th.th_current_task,
381  team->t.t_implicit_task_taskdata[tid].td_parent));
382 
383  KMP_DEBUG_ASSERT(this_thr != NULL);
384 
385  if (tid == 0) {
386  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
387  team->t.t_implicit_task_taskdata[0].td_parent =
388  this_thr->th.th_current_task;
389  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
390  }
391  } else {
392  team->t.t_implicit_task_taskdata[tid].td_parent =
393  team->t.t_implicit_task_taskdata[0].td_parent;
394  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
395  }
396 
397  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
398  "curtask=%p "
399  "parent_task=%p\n",
400  tid, this_thr, this_thr->th.th_current_task,
401  team->t.t_implicit_task_taskdata[tid].td_parent));
402 }
403 
404 // __kmp_task_start: bookkeeping for a task starting execution
405 //
406 // GTID: global thread id of calling thread
407 // task: task starting execution
408 // current_task: task suspending
409 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
410  kmp_taskdata_t *current_task) {
411  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
412  kmp_info_t *thread = __kmp_threads[gtid];
413 
414  KA_TRACE(10,
415  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
416  gtid, taskdata, current_task));
417 
418  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
419 
420  // mark currently executing task as suspended
421  // TODO: GEH - make sure root team implicit task is initialized properly.
422  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
423  current_task->td_flags.executing = 0;
424 
425 // Add task to stack if tied
426 #ifdef BUILD_TIED_TASK_STACK
427  if (taskdata->td_flags.tiedness == TASK_TIED) {
428  __kmp_push_task_stack(gtid, thread, taskdata);
429  }
430 #endif /* BUILD_TIED_TASK_STACK */
431 
432  // mark starting task as executing and as current task
433  thread->th.th_current_task = taskdata;
434 
435  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
436  taskdata->td_flags.tiedness == TASK_UNTIED);
437  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
438  taskdata->td_flags.tiedness == TASK_UNTIED);
439  taskdata->td_flags.started = 1;
440  taskdata->td_flags.executing = 1;
441  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
442  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
443 
444  // GEH TODO: shouldn't we pass some sort of location identifier here?
445  // APT: yes, we will pass location here.
446  // need to store current thread state (in a thread or taskdata structure)
447  // before setting work_state, otherwise wrong state is set after end of task
448 
449  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
450 
451 #if OMPT_SUPPORT
452  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
453  kmp_taskdata_t *parent = taskdata->td_parent;
454  ompt_callbacks.ompt_callback(ompt_event_task_begin)(
455  parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
456  parent ? &(parent->ompt_task_info.frame) : NULL,
457  taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
458  }
459 #endif
460 #if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
461  /* OMPT emit all dependences if requested by the tool */
462  if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
463  ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
464  ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
465  taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
466  taskdata->ompt_task_info.ndeps);
467  /* We can now free the allocated memory for the dependencies */
468  KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
469  taskdata->ompt_task_info.deps = NULL;
470  taskdata->ompt_task_info.ndeps = 0;
471  }
472 #endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
473 
474  return;
475 }
476 
477 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
478 // execution
479 //
480 // loc_ref: source location information; points to beginning of task block.
481 // gtid: global thread number.
482 // task: task thunk for the started task.
483 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
484  kmp_task_t *task) {
485  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
486  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
487 
488  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
489  "current_task=%p\n",
490  gtid, loc_ref, taskdata, current_task));
491 
492  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
493  // untied task needs to increment counter so that the task structure is not
494  // freed prematurely
495  kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
496  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
497  "incremented for task %p\n",
498  gtid, counter, taskdata));
499  }
500 
501  taskdata->td_flags.task_serial =
502  1; // Execute this task immediately, not deferred.
503  __kmp_task_start(gtid, task, current_task);
504 
505  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
506  loc_ref, taskdata));
507 
508  return;
509 }
510 
511 #ifdef TASK_UNUSED
512 // __kmpc_omp_task_begin: report that a given task has started execution
513 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
514 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
515  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
516 
517  KA_TRACE(
518  10,
519  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
520  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
521 
522  __kmp_task_start(gtid, task, current_task);
523 
524  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
525  loc_ref, KMP_TASK_TO_TASKDATA(task)));
526  return;
527 }
528 #endif // TASK_UNUSED
529 
530 // __kmp_free_task: free the current task space and the space for shareds
531 //
532 // gtid: Global thread ID of calling thread
533 // taskdata: task to free
534 // thread: thread data structure of caller
535 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
536  kmp_info_t *thread) {
537  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
538  taskdata));
539 
540  // Check to make sure all flags and counters have the correct values
541  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
542  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
543  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
544  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
545  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
546  taskdata->td_flags.task_serial == 1);
547  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);
548 
549  taskdata->td_flags.freed = 1;
550  ANNOTATE_HAPPENS_BEFORE(taskdata);
551 // deallocate the taskdata and shared variable blocks associated with this task
552 #if USE_FAST_MEMORY
553  __kmp_fast_free(thread, taskdata);
554 #else /* ! USE_FAST_MEMORY */
555  __kmp_thread_free(thread, taskdata);
556 #endif
557 
558  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
559 }
560 
561 // __kmp_free_task_and_ancestors: free the current task and ancestors without
562 // children
563 //
564 // gtid: Global thread ID of calling thread
565 // taskdata: task to free
566 // thread: thread data structure of caller
567 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
568  kmp_taskdata_t *taskdata,
569  kmp_info_t *thread) {
570 #if OMP_45_ENABLED
571  // Proxy tasks must always be allowed to free their parents
572  // because they can be run in background even in serial mode.
573  kmp_int32 team_serial =
574  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
575  !taskdata->td_flags.proxy;
576 #else
577  kmp_int32 team_serial =
578  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
579 #endif
580  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
581 
582  kmp_int32 children =
583  KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
584  KMP_DEBUG_ASSERT(children >= 0);
585 
586  // Now, go up the ancestor tree to see if any ancestors can now be freed.
587  while (children == 0) {
588  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
589 
590  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
591  "and freeing itself\n",
592  gtid, taskdata));
593 
594  // --- Deallocate my ancestor task ---
595  __kmp_free_task(gtid, taskdata, thread);
596 
597  taskdata = parent_taskdata;
598 
599  // Stop checking ancestors at implicit task instead of walking up ancestor
600  // tree to avoid premature deallocation of ancestors.
601  if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
602  return;
603 
604  // Predecrement simulated by "- 1" calculation
605  children = KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
606  KMP_DEBUG_ASSERT(children >= 0);
607  }
608 
609  KA_TRACE(
610  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
611  "not freeing it yet\n",
612  gtid, taskdata, children));
613 }
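// Editor's note: the "predecrement simulated by '- 1'" idiom used above works
// because KMP_TEST_THEN_DEC32 is a fetch-and-decrement that returns the value
// *before* the decrement. A compiled-out sketch of the idiom (hypothetical
// variable names) for reference:
#if 0
  kmp_int32 old_count = KMP_TEST_THEN_DEC32(&refcount); // value before decrement
  kmp_int32 new_count = old_count - 1;                  // value after decrement
  if (new_count == 0) {
    // last reference released; the object may now be freed
  }
#endif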
614 
615 // __kmp_task_finish: bookkeeping to do when a task finishes execution
616 //
617 // gtid: global thread ID for calling thread
618 // task: task to be finished
619 // resumed_task: task to be resumed. (may be NULL if task is serialized)
620 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
621  kmp_taskdata_t *resumed_task) {
622  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
623  kmp_info_t *thread = __kmp_threads[gtid];
624  kmp_task_team_t *task_team =
625  thread->th.th_task_team; // might be NULL for serial teams...
626  kmp_int32 children = 0;
627 
628 #if OMPT_SUPPORT
629  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
630  kmp_taskdata_t *parent = taskdata->td_parent;
631  ompt_callbacks.ompt_callback(ompt_event_task_end)(
632  taskdata->ompt_task_info.task_id);
633  }
634 #endif
635 
636  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
637  "task %p\n",
638  gtid, taskdata, resumed_task));
639 
640  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
641 
642 // Pop task from stack if tied
643 #ifdef BUILD_TIED_TASK_STACK
644  if (taskdata->td_flags.tiedness == TASK_TIED) {
645  __kmp_pop_task_stack(gtid, thread, taskdata);
646  }
647 #endif /* BUILD_TIED_TASK_STACK */
648 
649  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
650  // untied task needs to check the counter so that the task structure is not
651  // freed prematurely
652  kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
653  KA_TRACE(
654  20,
655  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
656  gtid, counter, taskdata));
657  if (counter > 0) {
658  // untied task is not done, to be continued possibly by other thread, do
659  // not free it now
660  if (resumed_task == NULL) {
661  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
662  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
663  // task is the parent
664  }
665  thread->th.th_current_task = resumed_task; // restore current_task
666  resumed_task->td_flags.executing = 1; // resume previous task
667  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
668  "resuming task %p\n",
669  gtid, taskdata, resumed_task));
670  return;
671  }
672  }
673 
674  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
675  taskdata->td_flags.complete = 1; // mark the task as completed
676  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
677  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
678 
679  // Only need to keep track of count if team parallel and tasking not
680  // serialized
681  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
682  // Predecrement simulated by "- 1" calculation
683  children =
684  KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) -
685  1;
686  KMP_DEBUG_ASSERT(children >= 0);
687 #if OMP_40_ENABLED
688  if (taskdata->td_taskgroup)
689  KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
690 #if OMP_45_ENABLED
691  }
692  // if we found proxy tasks there could exist a dependency chain
693  // with the proxy task as origin
694  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
695  (task_team && task_team->tt.tt_found_proxy_tasks)) {
696 #endif
697  __kmp_release_deps(gtid, taskdata);
698 #endif
699  }
700 
701  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
702 // called. Otherwise, if a task is executed immediately from the release_deps
703  // code, the flag will be reset to 1 again by this same function
704  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
705  taskdata->td_flags.executing = 0; // suspend the finishing task
706 
707  KA_TRACE(
708  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
709  gtid, taskdata, children));
710 
711 #if OMP_40_ENABLED
712  /* If the task's destructor thunk flag has been set, we need to invoke the
713  destructor thunk that has been generated by the compiler. The code is
714  placed here, since at this point other tasks might have been released
715  hence overlapping the destructor invocations with some other work in the
716  released tasks. The OpenMP spec is not specific on when the destructors
717  are invoked, so we should be free to choose. */
718  if (taskdata->td_flags.destructors_thunk) {
719  kmp_routine_entry_t destr_thunk = task->data1.destructors;
720  KMP_ASSERT(destr_thunk);
721  destr_thunk(gtid, task);
722  }
723 #endif // OMP_40_ENABLED
724 
725  // bookkeeping for resuming task:
726  // GEH - note tasking_ser => task_serial
727  KMP_DEBUG_ASSERT(
728  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
729  taskdata->td_flags.task_serial);
730  if (taskdata->td_flags.task_serial) {
731  if (resumed_task == NULL) {
732  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
733  // task is the parent
734  } else
735 #if OMP_45_ENABLED
736  if (!(task_team && task_team->tt.tt_found_proxy_tasks))
737 #endif
738  {
739  // verify resumed task passed in points to parent
740  KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
741  }
742  } else {
743  KMP_DEBUG_ASSERT(resumed_task !=
744  NULL); // verify that resumed task is passed as argument
745  }
746 
747  // Free this task and then ancestor tasks if they have no children.
748  // Restore th_current_task first as suggested by John:
749  // johnmc: if an asynchronous inquiry peers into the runtime system
750  // it doesn't see the freed task as the current task.
751  thread->th.th_current_task = resumed_task;
752  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
753 
754  // TODO: GEH - make sure root team implicit task is initialized properly.
755  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
756  resumed_task->td_flags.executing = 1; // resume previous task
757 
758  KA_TRACE(
759  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
760  gtid, taskdata, resumed_task));
761 
762  return;
763 }
764 
765 // __kmpc_omp_task_complete_if0: report that a task has completed execution
766 //
767 // loc_ref: source location information; points to end of task block.
768 // gtid: global thread number.
769 // task: task thunk for the completed task.
770 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
771  kmp_task_t *task) {
772  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
773  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
774  // this routine will provide task to resume
775  __kmp_task_finish(gtid, task, NULL);
776 
777  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
778  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
779  return;
780 }
781 
782 #ifdef TASK_UNUSED
783 // __kmpc_omp_task_complete: report that a task has completed execution
784 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
785 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
786  kmp_task_t *task) {
787  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
788  loc_ref, KMP_TASK_TO_TASKDATA(task)));
789 
790  __kmp_task_finish(gtid, task, NULL); // Not sure how to find task to resume
791 
792  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
793  loc_ref, KMP_TASK_TO_TASKDATA(task)));
794  return;
795 }
796 #endif // TASK_UNUSED
797 
798 #if OMPT_SUPPORT
799 // __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will
800 // only be called after ompt_tool, so we already know whether ompt is enabled
801 // or not.
802 static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid,
803  void *function) {
804  if (ompt_enabled) {
805  task->ompt_task_info.task_id = __ompt_task_id_new(tid);
806  task->ompt_task_info.function = function;
807  task->ompt_task_info.frame.exit_runtime_frame = NULL;
808  task->ompt_task_info.frame.reenter_runtime_frame = NULL;
809 #if OMP_40_ENABLED
810  task->ompt_task_info.ndeps = 0;
811  task->ompt_task_info.deps = NULL;
812 #endif /* OMP_40_ENABLED */
813  }
814 }
815 #endif
816 
817 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
818 // task for a given thread
819 //
820 // loc_ref: reference to source location of parallel region
821 // this_thr: thread data structure corresponding to implicit task
822 // team: team for this_thr
823 // tid: thread id of given thread within team
824 // set_curr_task: TRUE if need to push current task to thread
825 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to
826 // have already been done elsewhere.
827 // TODO: Get better loc_ref. Value passed in may be NULL
828 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
829  kmp_team_t *team, int tid, int set_curr_task) {
830  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
831 
832  KF_TRACE(
833  10,
834  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
835  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
836 
837  task->td_task_id = KMP_GEN_TASK_ID();
838  task->td_team = team;
839  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
840  // in debugger)
841  task->td_ident = loc_ref;
842  task->td_taskwait_ident = NULL;
843  task->td_taskwait_counter = 0;
844  task->td_taskwait_thread = 0;
845 
846  task->td_flags.tiedness = TASK_TIED;
847  task->td_flags.tasktype = TASK_IMPLICIT;
848 #if OMP_45_ENABLED
849  task->td_flags.proxy = TASK_FULL;
850 #endif
851 
852  // All implicit tasks are executed immediately, not deferred
853  task->td_flags.task_serial = 1;
854  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
855  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
856 
857  task->td_flags.started = 1;
858  task->td_flags.executing = 1;
859  task->td_flags.complete = 0;
860  task->td_flags.freed = 0;
861 
862 #if OMP_40_ENABLED
863  task->td_depnode = NULL;
864 #endif
865 
866  if (set_curr_task) { // only do this init first time thread is created
867  task->td_incomplete_child_tasks = 0;
868  // Not used: don't need to deallocate implicit task
869  task->td_allocated_child_tasks = 0;
870 #if OMP_40_ENABLED
871  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
872  task->td_dephash = NULL;
873 #endif
874  __kmp_push_current_task_to_thread(this_thr, team, tid);
875  } else {
876  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
877  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
878  }
879 
880 #if OMPT_SUPPORT
881  __kmp_task_init_ompt(task, tid, NULL);
882 #endif
883 
884  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
885  team, task));
886 }
887 
888 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
889 // at the end of parallel regions. Some resources are kept for reuse in the next
890 // parallel region.
891 //
892 // thread: thread data structure corresponding to implicit task
893 void __kmp_finish_implicit_task(kmp_info_t *thread) {
894  kmp_taskdata_t *task = thread->th.th_current_task;
895  if (task->td_dephash)
896  __kmp_dephash_free_entries(thread, task->td_dephash);
897 }
898 
899 // __kmp_free_implicit_task: Release resources associated with implicit tasks
900 // when these regions are destroyed
901 //
902 // thread: thread data structure corresponding to implicit task
903 void __kmp_free_implicit_task(kmp_info_t *thread) {
904  kmp_taskdata_t *task = thread->th.th_current_task;
905  if (task->td_dephash)
906  __kmp_dephash_free(thread, task->td_dephash);
907  task->td_dephash = NULL;
908 }
909 
910 // Round up a size to a power of two specified by val: Used to insert padding
911 // between structures co-allocated using a single malloc() call
912 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
913  if (size & (val - 1)) {
914  size &= ~(val - 1);
915  if (size <= KMP_SIZE_T_MAX - val) {
916  size += val; // Round up if there is no overflow.
917  }; // if
918  }; // if
919  return size;
920 } // __kmp_round_up_to_val
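// Editor's note: compiled-out usage sketch of the rounding helper above,
// assuming an 8-byte alignment value:
#if 0
  size_t a = __kmp_round_up_to_val(24, 8); // 24: already a multiple of 8
  size_t b = __kmp_round_up_to_val(25, 8); // 32: rounded up to the next multiple
#endif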
921 
922 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
923 //
924 // loc_ref: source location information
925 // gtid: global thread number.
926 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
927 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
928 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
929 // private vars accessed in task.
930 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
931 // in task.
932 // task_entry: Pointer to task code entry point generated by compiler.
933 // returns: a pointer to the allocated kmp_task_t structure (task).
934 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
935  kmp_tasking_flags_t *flags,
936  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
937  kmp_routine_entry_t task_entry) {
938  kmp_task_t *task;
939  kmp_taskdata_t *taskdata;
940  kmp_info_t *thread = __kmp_threads[gtid];
941  kmp_team_t *team = thread->th.th_team;
942  kmp_taskdata_t *parent_task = thread->th.th_current_task;
943  size_t shareds_offset;
944 
945  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
946  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
947  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
948  sizeof_shareds, task_entry));
949 
950  if (parent_task->td_flags.final) {
951  if (flags->merged_if0) {
952  }
953  flags->final = 1;
954  }
955 
956 #if OMP_45_ENABLED
957  if (flags->proxy == TASK_PROXY) {
958  flags->tiedness = TASK_UNTIED;
959  flags->merged_if0 = 1;
960 
961  /* are we running in a sequential parallel or tskm_immediate_exec... we need
962  tasking support enabled */
963  if ((thread->th.th_task_team) == NULL) {
964  /* This should only happen if the team is serialized
965  setup a task team and propagate it to the thread */
966  KMP_DEBUG_ASSERT(team->t.t_serialized);
967  KA_TRACE(30,
968  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
969  gtid));
970  __kmp_task_team_setup(
971  thread, team,
972  1); // 1 indicates setup the current team regardless of nthreads
973  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
974  }
975  kmp_task_team_t *task_team = thread->th.th_task_team;
976 
977  /* tasking must be enabled now as the task might not be pushed */
978  if (!KMP_TASKING_ENABLED(task_team)) {
979  KA_TRACE(
980  30,
981  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
982  __kmp_enable_tasking(task_team, thread);
983  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
984  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
985  // No lock needed since only owner can allocate
986  if (thread_data->td.td_deque == NULL) {
987  __kmp_alloc_task_deque(thread, thread_data);
988  }
989  }
990 
991  if (task_team->tt.tt_found_proxy_tasks == FALSE)
992  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
993  }
994 #endif
995 
996  // Calculate shared structure offset including padding after kmp_task_t struct
997  // to align pointers in shared struct
998  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
999  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1000 
1001  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1002  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1003  shareds_offset));
1004  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1005  sizeof_shareds));
1006 
1007 // Avoid double allocation here by combining shareds with taskdata
1008 #if USE_FAST_MEMORY
1009  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1010  sizeof_shareds);
1011 #else /* ! USE_FAST_MEMORY */
1012  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1013  sizeof_shareds);
1014 #endif /* USE_FAST_MEMORY */
1015  ANNOTATE_HAPPENS_AFTER(taskdata);
1016 
1017  task = KMP_TASKDATA_TO_TASK(taskdata);
1018 
1019 // Make sure task & taskdata are aligned appropriately
1020 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1021  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1022  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1023 #else
1024  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1025  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1026 #endif
1027  if (sizeof_shareds > 0) {
1028  // Avoid double allocation here by combining shareds with taskdata
1029  task->shareds = &((char *)taskdata)[shareds_offset];
1030  // Make sure shareds struct is aligned to pointer size
1031  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1032  0);
1033  } else {
1034  task->shareds = NULL;
1035  }
1036  task->routine = task_entry;
1037  task->part_id = 0; // AC: Always start with 0 part id
1038 
1039  taskdata->td_task_id = KMP_GEN_TASK_ID();
1040  taskdata->td_team = team;
1041  taskdata->td_alloc_thread = thread;
1042  taskdata->td_parent = parent_task;
1043  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1044  taskdata->td_untied_count = 0;
1045  taskdata->td_ident = loc_ref;
1046  taskdata->td_taskwait_ident = NULL;
1047  taskdata->td_taskwait_counter = 0;
1048  taskdata->td_taskwait_thread = 0;
1049  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1050 #if OMP_45_ENABLED
1051  // avoid copying icvs for proxy tasks
1052  if (flags->proxy == TASK_FULL)
1053 #endif
1054  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1055 
1056  taskdata->td_flags.tiedness = flags->tiedness;
1057  taskdata->td_flags.final = flags->final;
1058  taskdata->td_flags.merged_if0 = flags->merged_if0;
1059 #if OMP_40_ENABLED
1060  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1061 #endif // OMP_40_ENABLED
1062 #if OMP_45_ENABLED
1063  taskdata->td_flags.proxy = flags->proxy;
1064  taskdata->td_task_team = thread->th.th_task_team;
1065  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1066 #endif
1067  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1068 
1069  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1070  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1071 
1072  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1073  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1074 
1075  // GEH - Note we serialize the task if the team is serialized to make sure
1076  // implicit parallel region tasks are not left until program termination to
1077  // execute. Also, it helps locality to execute immediately.
1078 
1079  taskdata->td_flags.task_serial =
1080  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1081  taskdata->td_flags.tasking_ser);
1082 
1083  taskdata->td_flags.started = 0;
1084  taskdata->td_flags.executing = 0;
1085  taskdata->td_flags.complete = 0;
1086  taskdata->td_flags.freed = 0;
1087 
1088  taskdata->td_flags.native = flags->native;
1089 
1090  taskdata->td_incomplete_child_tasks = 0;
1091  taskdata->td_allocated_child_tasks = 1; // start at one because counts current
1092 // task and children
1093 #if OMP_40_ENABLED
1094  taskdata->td_taskgroup =
1095  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1096  taskdata->td_dephash = NULL;
1097  taskdata->td_depnode = NULL;
1098 #endif
1099 
1100 // Only need to keep track of child task counts if team parallel and tasking not
1101 // serialized or if it is a proxy task
1102 #if OMP_45_ENABLED
1103  if (flags->proxy == TASK_PROXY ||
1104  !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1105 #else
1106  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1107 #endif
1108  {
1109  KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks);
1110 #if OMP_40_ENABLED
1111  if (parent_task->td_taskgroup)
1112  KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
1113 #endif
1114  // Only need to keep track of allocated child tasks for explicit tasks since
1115  // implicit not deallocated
1116  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1117  KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks);
1118  }
1119  }
1120 
1121  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1122  gtid, taskdata, taskdata->td_parent));
1123  ANNOTATE_HAPPENS_BEFORE(task);
1124 
1125 #if OMPT_SUPPORT
1126  __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
1127 #endif
1128 
1129  return task;
1130 }
1131 
1132 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1133  kmp_int32 flags, size_t sizeof_kmp_task_t,
1134  size_t sizeof_shareds,
1135  kmp_routine_entry_t task_entry) {
1136  kmp_task_t *retval;
1137  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1138 
1139  input_flags->native = FALSE;
1140 // __kmp_task_alloc() sets up all other runtime flags
1141 
1142 #if OMP_45_ENABLED
1143  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1144  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1145  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1146  input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
1147  sizeof_shareds, task_entry));
1148 #else
1149  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1150  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1151  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1152  sizeof_kmp_task_t, sizeof_shareds, task_entry));
1153 #endif
1154 
1155  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1156  sizeof_shareds, task_entry);
1157 
1158  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1159 
1160  return retval;
1161 }
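// Editor's note (hedged sketch, not the output of any particular compiler):
// an OpenMP compiler typically pairs __kmpc_omp_task_alloc with
// __kmpc_omp_task -- allocate the thunk, copy shared/firstprivate data into
// it, then hand it to the runtime, which may queue it or run it immediately.
// The outline function name below is hypothetical. Compiled out:
#if 0
  kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /* flags = */ 1 /* tied */,
                                        sizeof_kmp_task_t, sizeof_shareds,
                                        &task_outline_fn /* hypothetical */);
  // ... fill t->shareds and the private area that follows kmp_task_t ...
  __kmpc_omp_task(&loc, gtid, t);
#endif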
1162 
1163 // __kmp_invoke_task: invoke the specified task
1164 //
1165 // gtid: global thread ID of caller
1166 // task: the task to invoke
1167 // current_task: the task to resume after task invocation
1168 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1169  kmp_taskdata_t *current_task) {
1170  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1171  kmp_uint64 cur_time;
1172 #if OMP_40_ENABLED
1173  int discard = 0 /* false */;
1174 #endif
1175  KA_TRACE(
1176  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1177  gtid, taskdata, current_task));
1178  KMP_DEBUG_ASSERT(task);
1179 #if OMP_45_ENABLED
1180  if (taskdata->td_flags.proxy == TASK_PROXY &&
1181  taskdata->td_flags.complete == 1) {
1182  // This is a proxy task that was already completed but it needs to run
1183  // its bottom-half finish
1184  KA_TRACE(
1185  30,
1186  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1187  gtid, taskdata));
1188 
1189  __kmp_bottom_half_finish_proxy(gtid, task);
1190 
1191  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1192  "proxy task %p, resuming task %p\n",
1193  gtid, taskdata, current_task));
1194 
1195  return;
1196  }
1197 #endif
1198 
1199 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1200  if (__kmp_forkjoin_frames_mode == 3) {
1201  // Get the current time stamp to measure task execution time to correct
1202  // barrier imbalance time
1203  cur_time = __itt_get_timestamp();
1204  }
1205 #endif
1206 
1207 #if OMP_45_ENABLED
1208  // Proxy tasks are not handled by the runtime
1209  if (taskdata->td_flags.proxy != TASK_PROXY) {
1210 #endif
1211  ANNOTATE_HAPPENS_AFTER(task);
1212  __kmp_task_start(gtid, task, current_task);
1213 #if OMP_45_ENABLED
1214  }
1215 #endif
1216 
1217 #if OMPT_SUPPORT
1218  ompt_thread_info_t oldInfo;
1219  kmp_info_t *thread;
1220  if (ompt_enabled) {
1221  // Store the threads states and restore them after the task
1222  thread = __kmp_threads[gtid];
1223  oldInfo = thread->th.ompt_thread_info;
1224  thread->th.ompt_thread_info.wait_id = 0;
1225  thread->th.ompt_thread_info.state = ompt_state_work_parallel;
1226  taskdata->ompt_task_info.frame.exit_runtime_frame =
1227  __builtin_frame_address(0);
1228  }
1229 #endif
1230 
1231 #if OMP_40_ENABLED
1232  // TODO: cancel tasks if the parallel region has also been cancelled
1233  // TODO: check if this sequence can be hoisted above __kmp_task_start
1234  // if cancellation has been enabled for this run ...
1235  if (__kmp_omp_cancellation) {
1236  kmp_info_t *this_thr = __kmp_threads[gtid];
1237  kmp_team_t *this_team = this_thr->th.th_team;
1238  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1239  if ((taskgroup && taskgroup->cancel_request) ||
1240  (this_team->t.t_cancel_request == cancel_parallel)) {
1241  KMP_COUNT_BLOCK(TASK_cancelled);
1242  // this task belongs to a task group and we need to cancel it
1243  discard = 1 /* true */;
1244  }
1245  }
1246 
1247  // Invoke the task routine and pass in relevant data.
1248  // Thunks generated by gcc take a different argument list.
1249  if (!discard) {
1250 #if KMP_STATS_ENABLED
1251  KMP_COUNT_BLOCK(TASK_executed);
1252  switch (KMP_GET_THREAD_STATE()) {
1253  case FORK_JOIN_BARRIER:
1254  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1255  break;
1256  case PLAIN_BARRIER:
1257  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1258  break;
1259  case TASKYIELD:
1260  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1261  break;
1262  case TASKWAIT:
1263  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1264  break;
1265  case TASKGROUP:
1266  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1267  break;
1268  default:
1269  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1270  break;
1271  }
1272 #endif // KMP_STATS_ENABLED
1273 #endif // OMP_40_ENABLED
1274 
1275 #if OMPT_SUPPORT && OMPT_TRACE
1276  /* let OMPT know that we're about to run this task */
1277  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
1278  ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1279  current_task->ompt_task_info.task_id,
1280  taskdata->ompt_task_info.task_id);
1281  }
1282 #endif
1283 
1284 #ifdef KMP_GOMP_COMPAT
1285  if (taskdata->td_flags.native) {
1286  ((void (*)(void *))(*(task->routine)))(task->shareds);
1287  } else
1288 #endif /* KMP_GOMP_COMPAT */
1289  {
1290  (*(task->routine))(gtid, task);
1291  }
1292  KMP_POP_PARTITIONED_TIMER();
1293 
1294 #if OMPT_SUPPORT && OMPT_TRACE
1295  /* let OMPT know that we're returning to the callee task */
1296  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
1297  ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1298  taskdata->ompt_task_info.task_id,
1299  current_task->ompt_task_info.task_id);
1300  }
1301 #endif
1302 
1303 #if OMP_40_ENABLED
1304  }
1305 #endif // OMP_40_ENABLED
1306 
1307 #if OMPT_SUPPORT
1308  if (ompt_enabled) {
1309  thread->th.ompt_thread_info = oldInfo;
1310  taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
1311  }
1312 #endif
1313 
1314 #if OMP_45_ENABLED
1315  // Proxy tasks are not handled by the runtime
1316  if (taskdata->td_flags.proxy != TASK_PROXY) {
1317 #endif
1318  ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1319  __kmp_task_finish(gtid, task, current_task);
1320 #if OMP_45_ENABLED
1321  }
1322 #endif
1323 
1324 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1325  // Barrier imbalance - correct arrive time after the task finished
1326  if (__kmp_forkjoin_frames_mode == 3) {
1327  kmp_info_t *this_thr = __kmp_threads[gtid];
1328  if (this_thr->th.th_bar_arrive_time) {
1329  this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1330  }
1331  }
1332 #endif
1333  KA_TRACE(
1334  30,
1335  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1336  gtid, taskdata, current_task));
1337  return;
1338 }
1339 
1340 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1341 //
1342 // loc_ref: location of original task pragma (ignored)
1343 // gtid: Global Thread ID of encountering thread
1344 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1345 // Returns:
1346 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1347 // be resumed later.
1348 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1349 // resumed later.
1350 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1351  kmp_task_t *new_task) {
1352  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1353 
1354  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1355  loc_ref, new_taskdata));
1356 
1357  /* Should we execute the new task or queue it? For now, let's just always try
1358  to queue it. If the queue fills up, then we'll execute it. */
1359 
1360  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1361  { // Execute this task immediately
1362  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1363  new_taskdata->td_flags.task_serial = 1;
1364  __kmp_invoke_task(gtid, new_task, current_task);
1365  }
1366 
1367  KA_TRACE(
1368  10,
1369  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1370  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1371  gtid, loc_ref, new_taskdata));
1372 
1373  ANNOTATE_HAPPENS_BEFORE(new_task);
1374  return TASK_CURRENT_NOT_QUEUED;
1375 }
1376 
1377 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1378 //
1379 // gtid: Global Thread ID of encountering thread
1380 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1381 // serialize_immediate: if TRUE then if the task is executed immediately its
1382 // execution will be serialized
1383 // Returns:
1384 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1385 // be resumed later.
1386 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1387 // resumed later.
1388 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1389  bool serialize_immediate) {
1390  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1391 
1392 #if OMPT_SUPPORT
1393  if (ompt_enabled) {
1394  new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1395  __builtin_frame_address(1);
1396  }
1397 #endif
1398 
1399 /* Should we execute the new task or queue it? For now, let's just always try to
1400  queue it. If the queue fills up, then we'll execute it. */
1401 #if OMP_45_ENABLED
1402  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1403  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1404 #else
1405  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1406 #endif
1407  { // Execute this task immediately
1408  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1409  if (serialize_immediate)
1410  new_taskdata->td_flags.task_serial = 1;
1411  __kmp_invoke_task(gtid, new_task, current_task);
1412  }
1413 
1414 #if OMPT_SUPPORT
1415  if (ompt_enabled) {
1416  new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1417  }
1418 #endif
1419 
1420  ANNOTATE_HAPPENS_BEFORE(new_task);
1421  return TASK_CURRENT_NOT_QUEUED;
1422 }
1423 
1424 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1425 // non-thread-switchable task from the parent thread only!
1426 //
1427 // loc_ref: location of original task pragma (ignored)
1428 // gtid: Global Thread ID of encountering thread
1429 // new_task: non-thread-switchable task thunk allocated by
1430 // __kmp_omp_task_alloc()
1431 // Returns:
1432 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1433 // be resumed later.
1434 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1435 // resumed later.
1436 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1437  kmp_task_t *new_task) {
1438  kmp_int32 res;
1439  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1440 
1441 #if KMP_DEBUG
1442  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1443 #endif
1444  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1445  new_taskdata));
1446 
1447  res = __kmp_omp_task(gtid, new_task, true);
1448 
1449  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1450  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1451  gtid, loc_ref, new_taskdata));
1452  return res;
1453 }
1454 
1455 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1456 // complete
1457 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1458  kmp_taskdata_t *taskdata;
1459  kmp_info_t *thread;
1460  int thread_finished = FALSE;
1461  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1462 
1463  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1464 
1465  if (__kmp_tasking_mode != tskm_immediate_exec) {
1466  thread = __kmp_threads[gtid];
1467  taskdata = thread->th.th_current_task;
1468 #if OMPT_SUPPORT && OMPT_TRACE
1469  ompt_task_id_t my_task_id;
1470  ompt_parallel_id_t my_parallel_id;
1471 
1472  if (ompt_enabled) {
1473  kmp_team_t *team = thread->th.th_team;
1474  my_task_id = taskdata->ompt_task_info.task_id;
1475  my_parallel_id = team->t.ompt_team_info.parallel_id;
1476 
1477  taskdata->ompt_task_info.frame.reenter_runtime_frame =
1478  __builtin_frame_address(1);
1479  if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1480  ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id,
1481  my_task_id);
1482  }
1483  }
1484 #endif
1485 
1486 // Debugger: The taskwait is active. Store location and thread encountered the
1487 // taskwait.
1488 #if USE_ITT_BUILD
1489 // Note: These values are used by ITT events as well.
1490 #endif /* USE_ITT_BUILD */
1491  taskdata->td_taskwait_counter += 1;
1492  taskdata->td_taskwait_ident = loc_ref;
1493  taskdata->td_taskwait_thread = gtid + 1;
1494 
1495 #if USE_ITT_BUILD
1496  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1497  if (itt_sync_obj != NULL)
1498  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1499 #endif /* USE_ITT_BUILD */
1500 
1501  bool must_wait =
1502  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1503 
1504 #if OMP_45_ENABLED
1505  must_wait = must_wait || (thread->th.th_task_team != NULL &&
1506  thread->th.th_task_team->tt.tt_found_proxy_tasks);
1507 #endif
1508  if (must_wait) {
1509  kmp_flag_32 flag(
1510  RCAST(volatile kmp_uint32 *, &taskdata->td_incomplete_child_tasks),
1511  0U);
1512  while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) {
1513  flag.execute_tasks(thread, gtid, FALSE,
1514  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1515  __kmp_task_stealing_constraint);
1516  }
1517  }
1518 #if USE_ITT_BUILD
1519  if (itt_sync_obj != NULL)
1520  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1521 #endif /* USE_ITT_BUILD */
1522 
1523  // Debugger: The taskwait is completed. Location remains, but thread is
1524  // negated.
1525  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1526 
1527 #if OMPT_SUPPORT && OMPT_TRACE
1528  if (ompt_enabled) {
1529  if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1530  ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id,
1531  my_task_id);
1532  }
1533  taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1534  }
1535 #endif
1536  ANNOTATE_HAPPENS_AFTER(taskdata);
1537  }
1538 
1539  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1540  "returning TASK_CURRENT_NOT_QUEUED\n",
1541  gtid, taskdata));
1542 
1543  return TASK_CURRENT_NOT_QUEUED;
1544 }
1545 
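// Usage sketch (illustrative): "#pragma omp taskwait" lowers to a call of this
// entry point. Note that the wait below is on td_incomplete_child_tasks, i.e.
// only on direct children of the encountering task, not on all descendants:
//
//   #pragma omp task
//   work_on(part1);
//   #pragma omp task
//   work_on(part2);
//   #pragma omp taskwait   // -> __kmpc_omp_taskwait(&loc, gtid)
//   combine(part1, part2);
//
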
1546 // __kmpc_omp_taskyield: switch to a different task
1547 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1548  kmp_taskdata_t *taskdata;
1549  kmp_info_t *thread;
1550  int thread_finished = FALSE;
1551 
1552  KMP_COUNT_BLOCK(OMP_TASKYIELD);
1553  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1554 
1555  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1556  gtid, loc_ref, end_part));
1557 
1558  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1559  thread = __kmp_threads[gtid];
1560  taskdata = thread->th.th_current_task;
1561 // Should we model this as a task wait or not?
1562 // Debugger: The taskwait is active. Store location and thread encountered the
1563 // taskwait.
1564 #if USE_ITT_BUILD
1565 // Note: These values are used by ITT events as well.
1566 #endif /* USE_ITT_BUILD */
1567  taskdata->td_taskwait_counter += 1;
1568  taskdata->td_taskwait_ident = loc_ref;
1569  taskdata->td_taskwait_thread = gtid + 1;
1570 
1571 #if USE_ITT_BUILD
1572  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1573  if (itt_sync_obj != NULL)
1574  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1575 #endif /* USE_ITT_BUILD */
1576  if (!taskdata->td_flags.team_serial) {
1577  kmp_task_team_t *task_team = thread->th.th_task_team;
1578  if (task_team != NULL) {
1579  if (KMP_TASKING_ENABLED(task_team)) {
1580  __kmp_execute_tasks_32(
1581  thread, gtid, NULL, FALSE,
1582  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1583  __kmp_task_stealing_constraint);
1584  }
1585  }
1586  }
1587 #if USE_ITT_BUILD
1588  if (itt_sync_obj != NULL)
1589  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1590 #endif /* USE_ITT_BUILD */
1591 
1592  // Debugger: The taskwait is completed. Location remains, but thread is
1593  // negated.
1594  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1595  }
1596 
1597  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1598  "returning TASK_CURRENT_NOT_QUEUED\n",
1599  gtid, taskdata));
1600 
1601  return TASK_CURRENT_NOT_QUEUED;
1602 }
1603 
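// Usage sketch (illustrative): "#pragma omp taskyield" lowers to this entry
// point; the runtime treats it as a hint and merely attempts to execute queued
// tasks (one call to __kmp_execute_tasks_32 above) before resuming:
//
//   while (!omp_test_lock(&lck)) {
//     #pragma omp taskyield   // -> __kmpc_omp_taskyield(&loc, gtid, 0)
//   }
//
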
1604 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1605 #if OMP_45_ENABLED
1606 // Task Reduction implementation
1607 
1608 typedef struct kmp_task_red_flags {
1609  unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
1610  unsigned reserved31 : 31;
1611 } kmp_task_red_flags_t;
1612 
1613 // internal structure for reduction data item related info
1614 typedef struct kmp_task_red_data {
1615  void *reduce_shar; // shared reduction item
1616  size_t reduce_size; // size of data item
1617  void *reduce_priv; // thread specific data
1618  void *reduce_pend; // end of private data for comparison op
1619  void *reduce_init; // data initialization routine
1620  void *reduce_fini; // data finalization routine
1621  void *reduce_comb; // data combiner routine
1622  kmp_task_red_flags_t flags; // flags for additional info from compiler
1623 } kmp_task_red_data_t;
1624 
1625 // structure sent to us by the compiler - one per reduction item
1626 typedef struct kmp_task_red_input {
1627  void *reduce_shar; // shared reduction item
1628  size_t reduce_size; // size of data item
1629  void *reduce_init; // data initialization routine
1630  void *reduce_fini; // data finalization routine
1631  void *reduce_comb; // data combiner routine
1632  kmp_task_red_flags_t flags; // flags for additional info from compiler
1633 } kmp_task_red_input_t;
1634 
1644 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
1645  kmp_info_t *thread = __kmp_threads[gtid];
1646  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
1647  kmp_int32 nth = thread->th.th_team_nproc;
1648  kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
1649  kmp_task_red_data_t *arr;
1650 
1651  // check input data just in case
1652  KMP_ASSERT(tg != NULL);
1653  KMP_ASSERT(data != NULL);
1654  KMP_ASSERT(num > 0);
1655  if (nth == 1) {
1656  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
1657  gtid, tg));
1658  return (void *)tg;
1659  }
1660  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
1661  gtid, tg, num));
1662  arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
1663  thread, num * sizeof(kmp_task_red_data_t));
1664  for (int i = 0; i < num; ++i) {
1665  void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
1666  size_t size = input[i].reduce_size - 1;
1667  // round the size up to a whole cache line per thread-specific item
1668  size += CACHE_LINE - size % CACHE_LINE;
1669  KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
1670  arr[i].reduce_shar = input[i].reduce_shar;
1671  arr[i].reduce_size = size;
1672  arr[i].reduce_init = input[i].reduce_init;
1673  arr[i].reduce_fini = input[i].reduce_fini;
1674  arr[i].reduce_comb = input[i].reduce_comb;
1675  arr[i].flags = input[i].flags;
1676  if (!input[i].flags.lazy_priv) {
1677  // allocate cache-line aligned block and fill it with zeros
1678  arr[i].reduce_priv = __kmp_allocate(nth * size);
1679  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
1680  if (f_init != NULL) {
1681  // initialize thread-specific items
1682  for (int j = 0; j < nth; ++j) {
1683  f_init((char *)(arr[i].reduce_priv) + j * size);
1684  }
1685  }
1686  } else {
1687  // only allocate space for pointers now,
1688  // objects will be lazily allocated/initialized once requested
1689  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
1690  }
1691  }
1692  tg->reduce_data = (void *)arr;
1693  tg->reduce_num_data = num;
1694  return (void *)tg;
1695 }
1696 
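// Sizing sketch (illustrative): assuming CACHE_LINE were 64 for the example,
// an 8-byte reduction item in the non-lazy path above is padded per thread as
//   size = (8 - 1) + (64 - (8 - 1) % 64) = 64
// so reduce_priv holds nth cache-line-sized copies and thread t works on
// (char *)reduce_priv + t * 64, avoiding false sharing between threads.
//
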
1706 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
1707  kmp_info_t *thread = __kmp_threads[gtid];
1708  kmp_int32 nth = thread->th.th_team_nproc;
1709  if (nth == 1)
1710  return data; // nothing to do
1711 
1712  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
1713  if (tg == NULL)
1714  tg = thread->th.th_current_task->td_taskgroup;
1715  KMP_ASSERT(tg != NULL);
1716  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
1717  kmp_int32 num = tg->reduce_num_data;
1718  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1719 
1720  KMP_ASSERT(data != NULL);
1721  while (tg != NULL) {
1722  for (int i = 0; i < num; ++i) {
1723  if (!arr[i].flags.lazy_priv) {
1724  if (data == arr[i].reduce_shar ||
1725  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
1726  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
1727  } else {
1728  // check shared location first
1729  void **p_priv = (void **)(arr[i].reduce_priv);
1730  if (data == arr[i].reduce_shar)
1731  goto found;
1732  // check if we get some thread specific location as parameter
1733  for (int j = 0; j < nth; ++j)
1734  if (data == p_priv[j])
1735  goto found;
1736  continue; // not found, continue search
1737  found:
1738  if (p_priv[tid] == NULL) {
1739  // allocate thread specific object lazily
1740  void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
1741  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
1742  if (f_init != NULL) {
1743  f_init(p_priv[tid]);
1744  }
1745  }
1746  return p_priv[tid];
1747  }
1748  }
1749  tg = tg->parent;
1750  arr = (kmp_task_red_data_t *)(tg->reduce_data);
1751  num = tg->reduce_num_data;
1752  }
1753  KMP_ASSERT2(0, "Unknown task reduction item");
1754  return NULL; // ERROR, this line never executed
1755 }
1756 
1757 // Finalize task reduction.
1758 // Called from __kmpc_end_taskgroup()
1759 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
1760  kmp_int32 nth = th->th.th_team_nproc;
1761  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
1762  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
1763  kmp_int32 num = tg->reduce_num_data;
1764  for (int i = 0; i < num; ++i) {
1765  void *sh_data = arr[i].reduce_shar;
1766  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
1767  void (*f_comb)(void *, void *) =
1768  (void (*)(void *, void *))(arr[i].reduce_comb);
1769  if (!arr[i].flags.lazy_priv) {
1770  void *pr_data = arr[i].reduce_priv;
1771  size_t size = arr[i].reduce_size;
1772  for (int j = 0; j < nth; ++j) {
1773  void *priv_data = (char *)pr_data + j * size;
1774  f_comb(sh_data, priv_data); // combine results
1775  if (f_fini)
1776  f_fini(priv_data); // finalize if needed
1777  }
1778  } else {
1779  void **pr_data = (void **)(arr[i].reduce_priv);
1780  for (int j = 0; j < nth; ++j) {
1781  if (pr_data[j] != NULL) {
1782  f_comb(sh_data, pr_data[j]); // combine results
1783  if (f_fini)
1784  f_fini(pr_data[j]); // finalize if needed
1785  __kmp_free(pr_data[j]);
1786  }
1787  }
1788  }
1789  __kmp_free(arr[i].reduce_priv);
1790  }
1791  __kmp_thread_free(th, arr);
1792  tg->reduce_data = NULL;
1793  tg->reduce_num_data = 0;
1794 }
1795 #endif
1796 
1797 #if OMP_40_ENABLED
1798 // __kmpc_taskgroup: Start a new taskgroup
1799 void __kmpc_taskgroup(ident_t *loc, int gtid) {
1800  kmp_info_t *thread = __kmp_threads[gtid];
1801  kmp_taskdata_t *taskdata = thread->th.th_current_task;
1802  kmp_taskgroup_t *tg_new =
1803  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
1804  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
1805  tg_new->count = 0;
1806  tg_new->cancel_request = cancel_noreq;
1807  tg_new->parent = taskdata->td_taskgroup;
1808 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1809 #if OMP_45_ENABLED
1810  tg_new->reduce_data = NULL;
1811  tg_new->reduce_num_data = 0;
1812 #endif
1813  taskdata->td_taskgroup = tg_new;
1814 }
1815 
1816 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
1817 // and its descendants are complete
1818 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
1819  kmp_info_t *thread = __kmp_threads[gtid];
1820  kmp_taskdata_t *taskdata = thread->th.th_current_task;
1821  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1822  int thread_finished = FALSE;
1823 
1824  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
1825  KMP_DEBUG_ASSERT(taskgroup != NULL);
1826  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
1827 
1828  if (__kmp_tasking_mode != tskm_immediate_exec) {
1829 #if USE_ITT_BUILD
1830  // For ITT the taskgroup wait is similar to taskwait until we need to
1831  // distinguish them
1832  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1833  if (itt_sync_obj != NULL)
1834  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1835 #endif /* USE_ITT_BUILD */
1836 
1837 #if OMP_45_ENABLED
1838  if (!taskdata->td_flags.team_serial ||
1839  (thread->th.th_task_team != NULL &&
1840  thread->th.th_task_team->tt.tt_found_proxy_tasks))
1841 #else
1842  if (!taskdata->td_flags.team_serial)
1843 #endif
1844  {
1845  kmp_flag_32 flag(RCAST(kmp_uint32 *, &taskgroup->count), 0U);
1846  while (TCR_4(taskgroup->count) != 0) {
1847  flag.execute_tasks(thread, gtid, FALSE,
1848  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1849  __kmp_task_stealing_constraint);
1850  }
1851  }
1852 
1853 #if USE_ITT_BUILD
1854  if (itt_sync_obj != NULL)
1855  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1856 #endif /* USE_ITT_BUILD */
1857  }
1858  KMP_DEBUG_ASSERT(taskgroup->count == 0);
1859 
1860 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
1861 #if OMP_45_ENABLED
1862  if (taskgroup->reduce_data != NULL) // need to reduce?
1863  __kmp_task_reduction_fini(thread, taskgroup);
1864 #endif
1865  // Restore parent taskgroup for the current task
1866  taskdata->td_taskgroup = taskgroup->parent;
1867  __kmp_thread_free(thread, taskgroup);
1868 
1869  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
1870  gtid, taskdata));
1871  ANNOTATE_HAPPENS_AFTER(taskdata);
1872 }
1873 #endif
1874 
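// Usage sketch (illustrative): the __kmpc_taskgroup / __kmpc_end_taskgroup
// pair above brackets a user taskgroup; with a reduction clause the compiler
// also calls __kmpc_task_reduction_init() after __kmpc_taskgroup(), and the
// combiners run in __kmp_task_reduction_fini() during end_taskgroup:
//
//   long sum = 0;
//   #pragma omp taskgroup task_reduction(+ : sum)
//   for (int i = 0; i < n; ++i) {
//     #pragma omp task in_reduction(+ : sum) firstprivate(i)
//     sum += a[i];
//   }
//
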
1875 // __kmp_remove_my_task: remove a task from my own deque
1876 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
1877  kmp_task_team_t *task_team,
1878  kmp_int32 is_constrained) {
1879  kmp_task_t *task;
1880  kmp_taskdata_t *taskdata;
1881  kmp_thread_data_t *thread_data;
1882  kmp_uint32 tail;
1883 
1884  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1885  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
1886  NULL); // Caller should check this condition
1887 
1888  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
1889 
1890  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1891  gtid, thread_data->td.td_deque_ntasks,
1892  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1893 
1894  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1895  KA_TRACE(10,
1896  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
1897  "ntasks=%d head=%u tail=%u\n",
1898  gtid, thread_data->td.td_deque_ntasks,
1899  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1900  return NULL;
1901  }
1902 
1903  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
1904 
1905  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
1906  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1907  KA_TRACE(10,
1908  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1909  "ntasks=%d head=%u tail=%u\n",
1910  gtid, thread_data->td.td_deque_ntasks,
1911  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1912  return NULL;
1913  }
1914 
1915  tail = (thread_data->td.td_deque_tail - 1) &
1916  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
1917  taskdata = thread_data->td.td_deque[tail];
1918 
1919  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
1920  // we need to check if the candidate obeys task scheduling constraint:
1921  // only child of current task can be scheduled
1922  kmp_taskdata_t *current = thread->th.th_current_task;
1923  kmp_int32 level = current->td_level;
1924  kmp_taskdata_t *parent = taskdata->td_parent;
1925  while (parent != current && parent->td_level > level) {
1926  parent = parent->td_parent; // check generation up to the level of the
1927  // current task
1928  KMP_DEBUG_ASSERT(parent != NULL);
1929  }
1930  if (parent != current) {
1931  // If the tail task is not a child, then no other child can appear in the
1932  // deque.
1933  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1934  KA_TRACE(10,
1935  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
1936  "ntasks=%d head=%u tail=%u\n",
1937  gtid, thread_data->td.td_deque_ntasks,
1938  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1939  return NULL;
1940  }
1941  }
1942 
1943  thread_data->td.td_deque_tail = tail;
1944  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
1945 
1946  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
1947 
1948  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
1949  "ntasks=%d head=%u tail=%u\n",
1950  gtid, taskdata, thread_data->td.td_deque_ntasks,
1951  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
1952 
1953  task = KMP_TASKDATA_TO_TASK(taskdata);
1954  return task;
1955 }
1956 
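// Index sketch (illustrative): the deque is a power-of-two ring buffer, so the
// wrap-around on pop is a mask rather than a modulo. Assuming a 256-entry
// deque (mask 0xFF) with td_deque_tail == 0, the computation above yields
//   tail = (0u - 1u) & 0xFF = 255
// i.e. the pop takes the most recently pushed task, stored in the last slot.
//
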
1957 // __kmp_steal_task: remove a task from another thread's deque
1958 // Assumes that the calling thread has already checked the existence of the
1959 // task_team's thread_data before calling this routine.
1960 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
1961  kmp_task_team_t *task_team,
1962  volatile kmp_int32 *unfinished_threads,
1963  int *thread_finished,
1964  kmp_int32 is_constrained) {
1965  kmp_task_t *task;
1966  kmp_taskdata_t *taskdata;
1967  kmp_thread_data_t *victim_td, *threads_data;
1968  kmp_int32 victim_tid;
1969 
1970  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
1971 
1972  threads_data = task_team->tt.tt_threads_data;
1973  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
1974 
1975  victim_tid = victim->th.th_info.ds.ds_tid;
1976  victim_td = &threads_data[victim_tid];
1977 
1978  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
1979  "task_team=%p ntasks=%d "
1980  "head=%u tail=%u\n",
1981  gtid, __kmp_gtid_from_thread(victim), task_team,
1982  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1983  victim_td->td.td_deque_tail));
1984 
1985  if ((TCR_4(victim_td->td.td_deque_ntasks) ==
1986  0) || // Caller should not check this condition
1987  (TCR_PTR(victim->th.th_task_team) !=
1988  task_team)) // GEH: why would this happen?
1989  {
1990  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
1991  "task_team=%p "
1992  "ntasks=%d head=%u tail=%u\n",
1993  gtid, __kmp_gtid_from_thread(victim), task_team,
1994  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1995  victim_td->td.td_deque_tail));
1996  return NULL;
1997  }
1998 
1999  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2000 
2001  // Check again after we acquire the lock
2002  if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
2003  (TCR_PTR(victim->th.th_task_team) !=
2004  task_team)) // GEH: why would this happen?
2005  {
2006  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2007  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2008  "task_team=%p "
2009  "ntasks=%d head=%u tail=%u\n",
2010  gtid, __kmp_gtid_from_thread(victim), task_team,
2011  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2012  victim_td->td.td_deque_tail));
2013  return NULL;
2014  }
2015 
2016  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2017 
2018  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2019  if (is_constrained) {
2020  // we need to check if the candidate obeys task scheduling constraint:
2021  // only descendant of current task can be scheduled
2022  kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
2023  kmp_int32 level = current->td_level;
2024  kmp_taskdata_t *parent = taskdata->td_parent;
2025  while (parent != current && parent->td_level > level) {
2026  parent = parent->td_parent; // check generation up to the level of the
2027  // current task
2028  KMP_DEBUG_ASSERT(parent != NULL);
2029  }
2030  if (parent != current) {
2031  // If the head task is not a descendant of the current task then do not
2032  // steal it. No other task in victim's deque can be a descendant of the
2033  // current task.
2034  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2035  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
2036  "T#%d: task_team=%p "
2037  "ntasks=%d head=%u tail=%u\n",
2038  gtid,
2039  __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
2040  task_team, victim_td->td.td_deque_ntasks,
2041  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2042  return NULL;
2043  }
2044  }
2045  // Bump head pointer and wrap.
2046  victim_td->td.td_deque_head =
2047  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2048  if (*thread_finished) {
2049  // We need to un-mark this victim as a finished victim. This must be done
2050  // before releasing the lock, or else other threads (starting with the
2051  // master victim) might be prematurely released from the barrier!!!
2052  kmp_int32 count;
2053 
2054  count = KMP_TEST_THEN_INC32(unfinished_threads);
2055 
2056  KA_TRACE(
2057  20,
2058  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2059  gtid, count + 1, task_team));
2060 
2061  *thread_finished = FALSE;
2062  }
2063  TCW_4(victim_td->td.td_deque_ntasks,
2064  TCR_4(victim_td->td.td_deque_ntasks) - 1);
2065 
2066  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2067 
2068  KMP_COUNT_BLOCK(TASK_stolen);
2069  KA_TRACE(
2070  10,
2071  ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
2072  "ntasks=%d head=%u tail=%u\n",
2073  gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
2074  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2075  victim_td->td.td_deque_tail));
2076 
2077  task = KMP_TASKDATA_TO_TASK(taskdata);
2078  return task;
2079 }
2080 
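// Scheduling sketch (illustrative): the owner pops from the tail of its own
// deque (LIFO, __kmp_remove_my_task above) while a thief takes from the head
// (FIFO, this routine), a common work-stealing arrangement that keeps the
// owner's working set hot and hands thieves the oldest queued work:
//
//   owner: tail = (tail - 1) & mask;  task = deque[tail];
//   thief: task = deque[head];        head = (head + 1) & mask;
//
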
2081 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2082 // condition is satisfied (return true) or there are none left (return false).
2083 //
2084 // final_spin is TRUE if this is the spin at the release barrier.
2085 // thread_finished indicates whether the thread is finished executing all
2086 // the tasks it has on its deque, and is at the release barrier.
2087 // flag encapsulates the location on which to spin (the spinner) and the
2088 // value checked to terminate the spin (the checker).
2089 // flag == NULL means only execute a single task and return.
2090 template <class C>
2091 static inline int __kmp_execute_tasks_template(
2092  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2093  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2094  kmp_int32 is_constrained) {
2095  kmp_task_team_t *task_team = thread->th.th_task_team;
2096  kmp_thread_data_t *threads_data;
2097  kmp_task_t *task;
2098  kmp_info_t *other_thread;
2099  kmp_taskdata_t *current_task = thread->th.th_current_task;
2100  volatile kmp_int32 *unfinished_threads;
2101  kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
2102  tid = thread->th.th_info.ds.ds_tid;
2103 
2104  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2105  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2106 
2107  if (task_team == NULL)
2108  return FALSE;
2109 
2110  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2111  "*thread_finished=%d\n",
2112  gtid, final_spin, *thread_finished));
2113 
2114  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2115  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2116  KMP_DEBUG_ASSERT(threads_data != NULL);
2117 
2118  nthreads = task_team->tt.tt_nproc;
2119  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2120 #if OMP_45_ENABLED
2121  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2122 #else
2123  KMP_DEBUG_ASSERT(nthreads > 1);
2124 #endif
2125  KMP_DEBUG_ASSERT(TCR_4(*unfinished_threads) >= 0);
2126 
2127  while (1) { // Outer loop keeps trying to find tasks in case of single thread
2128  // getting tasks from target constructs
2129  while (1) { // Inner loop to find a task and execute it
2130  task = NULL;
2131  if (use_own_tasks) { // check on own queue first
2132  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2133  }
2134  if ((task == NULL) && (nthreads > 1)) { // Steal a task
2135  int asleep = 1;
2136  use_own_tasks = 0;
2137  // Try to steal from the last place I stole from successfully.
2138  if (victim == -2) { // haven't stolen anything yet
2139  victim = threads_data[tid].td.td_deque_last_stolen;
2140  if (victim !=
2141  -1) // if we have a last stolen from victim, get the thread
2142  other_thread = threads_data[victim].td.td_thr;
2143  }
2144  if (victim != -1) { // found last victim
2145  asleep = 0;
2146  } else if (!new_victim) { // no recent steals and we haven't already
2147  // used a new victim; select a random thread
2148  do { // Find a different thread to steal work from.
2149  // Pick a random thread. Initial plan was to cycle through all the
2150  // threads, and only return if we tried to steal from every thread,
2151  // and failed. Arch says that's not such a great idea.
2152  victim = __kmp_get_random(thread) % (nthreads - 1);
2153  if (victim >= tid) {
2154  ++victim; // Adjusts random distribution to exclude self
2155  }
2156  // Found a potential victim
2157  other_thread = threads_data[victim].td.td_thr;
2158  // There is a slight chance that __kmp_enable_tasking() did not wake
2159  // up all threads waiting at the barrier. If victim is sleeping,
2160  // then wake it up. Since we were going to pay the cache miss
2161  // penalty for referencing another thread's kmp_info_t struct
2162  // anyway,
2163  // the check shouldn't cost too much performance at this point. In
2164  // extra barrier mode, tasks do not sleep at the separate tasking
2165  // barrier, so this isn't a problem.
2166  asleep = 0;
2167  if ((__kmp_tasking_mode == tskm_task_teams) &&
2168  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2169  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2170  NULL)) {
2171  asleep = 1;
2172  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2173  other_thread->th.th_sleep_loc);
2174  // A sleeping thread should not have any tasks on its queue.
2175  // There is a slight possibility that it resumes, steals a task
2176  // from another thread, which spawns more tasks, all in the time
2177  // that it takes this thread to check => don't write an assertion
2178  // that the victim's queue is empty. Try stealing from a
2179  // different thread.
2180  }
2181  } while (asleep);
2182  }
2183 
2184  if (!asleep) {
2185  // We have a victim to try to steal from
2186  task = __kmp_steal_task(other_thread, gtid, task_team,
2187  unfinished_threads, thread_finished,
2188  is_constrained);
2189  }
2190  if (task != NULL) { // set last stolen to victim
2191  if (threads_data[tid].td.td_deque_last_stolen != victim) {
2192  threads_data[tid].td.td_deque_last_stolen = victim;
2193  // The pre-refactored code did not try more than 1 successful new
2194  // victim, unless the last one generated more local tasks;
2195  // new_victim keeps track of this
2196  new_victim = 1;
2197  }
2198  } else { // No tasks found; unset last_stolen
2199  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2200  victim = -2; // no successful victim found
2201  }
2202  }
2203 
2204  if (task == NULL) // break out of tasking loop
2205  break;
2206 
2207 // Found a task; execute it
2208 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2209  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2210  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2211  // get the object reliably
2212  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2213  }
2214  __kmp_itt_task_starting(itt_sync_obj);
2215  }
2216 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2217  __kmp_invoke_task(gtid, task, current_task);
2218 #if USE_ITT_BUILD
2219  if (itt_sync_obj != NULL)
2220  __kmp_itt_task_finished(itt_sync_obj);
2221 #endif /* USE_ITT_BUILD */
2222  // If this thread is only partway through the barrier and the condition is
2223  // met, then return now, so that the barrier gather/release pattern can
2224  // proceed. If this thread is in the last spin loop in the barrier,
2225  // waiting to be released, we know that the termination condition will not
2226  // be satisfied, so don't waste any cycles checking it.
2227  if (flag == NULL || (!final_spin && flag->done_check())) {
2228  KA_TRACE(
2229  15,
2230  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2231  gtid));
2232  return TRUE;
2233  }
2234  if (thread->th.th_task_team == NULL) {
2235  break;
2236  }
2237  // Yield before executing next task
2238  KMP_YIELD(__kmp_library == library_throughput);
2239  // If execution of a stolen task results in more tasks being placed on our
2240  // run queue, reset use_own_tasks
2241  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2242  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2243  "other tasks, restart\n",
2244  gtid));
2245  use_own_tasks = 1;
2246  new_victim = 0;
2247  }
2248  }
2249 
2250 // The task source has been exhausted. If in final spin loop of barrier, check
2251 // if termination condition is satisfied.
2252 #if OMP_45_ENABLED
2253  // The work queue may be empty but there might be proxy tasks still
2254  // executing
2255  if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
2256 #else
2257  if (final_spin)
2258 #endif
2259  {
2260  // First, decrement the #unfinished threads, if that has not already been
2261  // done. This decrement might be to the spin location, and result in the
2262  // termination condition being satisfied.
2263  if (!*thread_finished) {
2264  kmp_int32 count;
2265 
2266  count = KMP_TEST_THEN_DEC32(unfinished_threads) - 1;
2267  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2268  "unfinished_threads to %d task_team=%p\n",
2269  gtid, count, task_team));
2270  *thread_finished = TRUE;
2271  }
2272 
2273  // It is now unsafe to reference thread->th.th_team !!!
2274  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2275  // thread to pass through the barrier, where it might reset each thread's
2276  // th.th_team field for the next parallel region. If we can steal more
2277  // work, we know that this has not happened yet.
2278  if (flag != NULL && flag->done_check()) {
2279  KA_TRACE(
2280  15,
2281  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2282  gtid));
2283  return TRUE;
2284  }
2285  }
2286 
2287  // If this thread's task team is NULL, master has recognized that there are
2288  // no more tasks; bail out
2289  if (thread->th.th_task_team == NULL) {
2290  KA_TRACE(15,
2291  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2292  return FALSE;
2293  }
2294 
2295 #if OMP_45_ENABLED
2296  // We could be getting tasks from target constructs; if this is the only
2297  // thread, keep trying to execute tasks from own queue
2298  if (nthreads == 1)
2299  use_own_tasks = 1;
2300  else
2301 #endif
2302  {
2303  KA_TRACE(15,
2304  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
2305  return FALSE;
2306  }
2307  }
2308 }
2309 
2310 int __kmp_execute_tasks_32(
2311  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2312  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2313  kmp_int32 is_constrained) {
2314  return __kmp_execute_tasks_template(
2315  thread, gtid, flag, final_spin,
2316  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2317 }
2318 
2319 int __kmp_execute_tasks_64(
2320  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2321  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2322  kmp_int32 is_constrained) {
2323  return __kmp_execute_tasks_template(
2324  thread, gtid, flag, final_spin,
2325  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2326 }
2327 
2328 int __kmp_execute_tasks_oncore(
2329  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2330  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2331  kmp_int32 is_constrained) {
2332  return __kmp_execute_tasks_template(
2333  thread, gtid, flag, final_spin,
2334  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2335 }
2336 
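// Instantiation note (illustrative): the three wrappers above differ only in
// the flag type that carries the spin/termination condition; e.g. taskwait
// and taskgroup spin on a 32-bit counter, as shown earlier in this file:
//
//   kmp_flag_32 flag(RCAST(volatile kmp_uint32 *,
//                          &taskdata->td_incomplete_child_tasks), 0U);
//   flag.execute_tasks(thread, gtid, FALSE,
//                      &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                      __kmp_task_stealing_constraint);
//
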
2337 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
2338 // next barrier so they can assist in executing enqueued tasks.
2339 // First thread in allocates the task team atomically.
2340 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
2341  kmp_info_t *this_thr) {
2342  kmp_thread_data_t *threads_data;
2343  int nthreads, i, is_init_thread;
2344 
2345  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
2346  __kmp_gtid_from_thread(this_thr)));
2347 
2348  KMP_DEBUG_ASSERT(task_team != NULL);
2349  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2350 
2351  nthreads = task_team->tt.tt_nproc;
2352  KMP_DEBUG_ASSERT(nthreads > 0);
2353  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2354 
2355  // Allocate or increase the size of threads_data if necessary
2356  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
2357 
2358  if (!is_init_thread) {
2359  // Some other thread already set up the array.
2360  KA_TRACE(
2361  20,
2362  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2363  __kmp_gtid_from_thread(this_thr)));
2364  return;
2365  }
2366  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2367  KMP_DEBUG_ASSERT(threads_data != NULL);
2368 
2369  if ((__kmp_tasking_mode == tskm_task_teams) &&
2370  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
2371  // Release any threads sleeping at the barrier, so that they can steal
2372  // tasks and execute them. In extra barrier mode, tasks do not sleep
2373  // at the separate tasking barrier, so this isn't a problem.
2374  for (i = 0; i < nthreads; i++) {
2375  volatile void *sleep_loc;
2376  kmp_info_t *thread = threads_data[i].td.td_thr;
2377 
2378  if (i == this_thr->th.th_info.ds.ds_tid) {
2379  continue;
2380  }
2381  // Since we haven't locked the thread's suspend mutex lock at this
2382  // point, there is a small window where a thread might be putting
2383  // itself to sleep, but hasn't set the th_sleep_loc field yet.
2384  // To work around this, __kmp_execute_tasks_template() periodically checks
2385  // to see if other threads are sleeping (using the same random mechanism that
2386  // is used for task stealing) and awakens them if they are.
2387  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2388  NULL) {
2389  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2390  __kmp_gtid_from_thread(this_thr),
2391  __kmp_gtid_from_thread(thread)));
2392  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2393  } else {
2394  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2395  __kmp_gtid_from_thread(this_thr),
2396  __kmp_gtid_from_thread(thread)));
2397  }
2398  }
2399  }
2400 
2401  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
2402  __kmp_gtid_from_thread(this_thr)));
2403 }
2404 
2405 /* // TODO: Check the comment consistency
2406  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind
2407  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
2408  * After a child thread checks into a barrier and calls __kmp_release() from
2409  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
2410  * longer assume that the kmp_team_t structure is intact (at any moment, the
2411  * master thread may exit the barrier code and free the team data structure,
2412  * and return the threads to the thread pool).
2413  *
2414  * This does not work with the tasking code, as the thread is still
2415  * expected to participate in the execution of any tasks that may have been
2416  * spawned by a member of the team, and the thread still needs access to
2417  * each thread in the team, so that it can steal work from it.
2418  *
2419  * Enter the existence of the kmp_task_team_t struct. It employs a reference
2420  * counting mechanism, and is allocated by the master thread before calling
2421  * __kmp_<barrier_kind>_release, and then is released by the last thread to
2422  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
2423  * of the kmp_task_team_t structs for consecutive barriers can overlap
2424  * (and will, unless the master thread is the last thread to exit the barrier
2425  * release phase, which is not typical).
2426  *
2427  * The existence of such a struct is useful outside the context of tasking,
2428  * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
2429  * so that any performance differences show up when comparing the 2.5 vs. 3.0
2430  * libraries.
2431  *
2432  * We currently use the existence of the threads array as an indicator that
2433  * tasks were spawned since the last barrier. If the structure is to be
2434  * useful outside the context of tasking, then this will have to change, but
2435  * not setting the field minimizes the performance impact of tasking on
2436  * barriers, when no explicit tasks were spawned (pushed, actually).
2437  */
2438 
2439 static kmp_task_team_t *__kmp_free_task_teams =
2440  NULL; // Free list for task_team data structures
2441 // Lock for task team data structures
2442 static kmp_bootstrap_lock_t __kmp_task_team_lock =
2443  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
2444 
2445 // __kmp_alloc_task_deque:
2446 // Allocates a task deque for a particular thread, and initializes the necessary
2447 // data structures relating to the deque. This only happens once per thread
2448 // per task team since task teams are recycled. No lock is needed during
2449 // allocation since each thread allocates its own deque.
2450 static void __kmp_alloc_task_deque(kmp_info_t *thread,
2451  kmp_thread_data_t *thread_data) {
2452  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
2453  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
2454 
2455  // Initialize last stolen task field to "none"
2456  thread_data->td.td_deque_last_stolen = -1;
2457 
2458  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
2459  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
2460  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
2461 
2462  KE_TRACE(
2463  10,
2464  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2465  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
2466  // Allocate space for task deque, and zero the deque
2467  // Cannot use __kmp_thread_calloc() because threads not around for
2468  // kmp_reap_task_team( ).
2469  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
2470  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
2471  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2472 }
2473 
2474 // __kmp_realloc_task_deque:
2475 // Re-allocates a task deque for a particular thread, copies the content from
2476 // the old deque and adjusts the necessary data structures relating to the
2477 // deque. This operation must be done with the deque_lock held
2478 static void __kmp_realloc_task_deque(kmp_info_t *thread,
2479  kmp_thread_data_t *thread_data) {
2480  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2481  kmp_int32 new_size = 2 * size;
2482 
2483  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
2484  "%d] for thread_data %p\n",
2485  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
2486 
2487  kmp_taskdata_t **new_deque =
2488  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
2489 
2490  int i, j;
2491  for (i = thread_data->td.td_deque_head, j = 0; j < size;
2492  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
2493  new_deque[j] = thread_data->td.td_deque[i];
2494 
2495  __kmp_free(thread_data->td.td_deque);
2496 
2497  thread_data->td.td_deque_head = 0;
2498  thread_data->td.td_deque_tail = size;
2499  thread_data->td.td_deque = new_deque;
2500  thread_data->td.td_deque_size = new_size;
2501 }
2502 
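// Resize sketch (illustrative): the copy loop above re-linearizes the ring so
// the old contents occupy slots [0, size) of the doubled buffer. E.g. with
// size == 4, head == tail == 2 (full) and slots {C, D, A, B}, the new deque
// is {A, B, C, D, -, -, -, -} with head = 0 and tail = 4.
//
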
2503 // __kmp_free_task_deque:
2504 // Deallocates a task deque for a particular thread. Happens at library
2505 // deallocation so don't need to reset all thread data fields.
2506 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
2507  if (thread_data->td.td_deque != NULL) {
2508  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2509  TCW_4(thread_data->td.td_deque_ntasks, 0);
2510  __kmp_free(thread_data->td.td_deque);
2511  thread_data->td.td_deque = NULL;
2512  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2513  }
2514 
2515 #ifdef BUILD_TIED_TASK_STACK
2516  // GEH: Figure out what to do here for td_susp_tied_tasks
2517  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
2518  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
2519  }
2520 #endif // BUILD_TIED_TASK_STACK
2521 }
2522 
2523 // __kmp_realloc_task_threads_data:
2524 // Allocates a threads_data array for a task team, either by allocating an
2525 // initial array or enlarging an existing array. Only the first thread to get
2526 // the lock allocates or enlarges the array and re-initializes the array elements.
2527 // That thread returns "TRUE", the rest return "FALSE".
2528 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2529 // The current size is given by task_team -> tt.tt_max_threads.
2530 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
2531  kmp_task_team_t *task_team) {
2532  kmp_thread_data_t **threads_data_p;
2533  kmp_int32 nthreads, maxthreads;
2534  int is_init_thread = FALSE;
2535 
2536  if (TCR_4(task_team->tt.tt_found_tasks)) {
2537  // Already reallocated and initialized.
2538  return FALSE;
2539  }
2540 
2541  threads_data_p = &task_team->tt.tt_threads_data;
2542  nthreads = task_team->tt.tt_nproc;
2543  maxthreads = task_team->tt.tt_max_threads;
2544 
2545  // All threads must lock when they encounter the first task of the implicit
2546  // task region to make sure threads_data fields are (re)initialized before
2547  // used.
2548  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2549 
2550  if (!TCR_4(task_team->tt.tt_found_tasks)) {
2551  // first thread to enable tasking
2552  kmp_team_t *team = thread->th.th_team;
2553  int i;
2554 
2555  is_init_thread = TRUE;
2556  if (maxthreads < nthreads) {
2557 
2558  if (*threads_data_p != NULL) {
2559  kmp_thread_data_t *old_data = *threads_data_p;
2560  kmp_thread_data_t *new_data = NULL;
2561 
2562  KE_TRACE(
2563  10,
2564  ("__kmp_realloc_task_threads_data: T#%d reallocating "
2565  "threads data for task_team %p, new_size = %d, old_size = %d\n",
2566  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
2567  // Reallocate threads_data to have more elements than current array
2568  // Cannot use __kmp_thread_realloc() because threads not around for
2569  // kmp_reap_task_team( ). Note all new array entries are initialized
2570  // to zero by __kmp_allocate().
2571  new_data = (kmp_thread_data_t *)__kmp_allocate(
2572  nthreads * sizeof(kmp_thread_data_t));
2573  // copy old data to new data
2574  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
2575  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
2576 
2577 #ifdef BUILD_TIED_TASK_STACK
2578  // GEH: Figure out if this is the right thing to do
2579  for (i = maxthreads; i < nthreads; i++) {
2580  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2581  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2582  }
2583 #endif // BUILD_TIED_TASK_STACK
2584  // Install the new data and free the old data
2585  (*threads_data_p) = new_data;
2586  __kmp_free(old_data);
2587  } else {
2588  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
2589  "threads data for task_team %p, size = %d\n",
2590  __kmp_gtid_from_thread(thread), task_team, nthreads));
2591  // Make the initial allocate for threads_data array, and zero entries
2592  // Cannot use __kmp_thread_calloc() because threads not around for
2593  // kmp_reap_task_team( ).
2594  ANNOTATE_IGNORE_WRITES_BEGIN();
2595  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
2596  nthreads * sizeof(kmp_thread_data_t));
2597  ANNOTATE_IGNORE_WRITES_END();
2598 #ifdef BUILD_TIED_TASK_STACK
2599  // GEH: Figure out if this is the right thing to do
2600  for (i = 0; i < nthreads; i++) {
2601  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2602  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
2603  }
2604 #endif // BUILD_TIED_TASK_STACK
2605  }
2606  task_team->tt.tt_max_threads = nthreads;
2607  } else {
2608  // If array has (more than) enough elements, go ahead and use it
2609  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
2610  }
2611 
2612  // initialize threads_data pointers back to thread_info structures
2613  for (i = 0; i < nthreads; i++) {
2614  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
2615  thread_data->td.td_thr = team->t.t_threads[i];
2616 
2617  if (thread_data->td.td_deque_last_stolen >= nthreads) {
2618  // The last stolen field survives across teams / barrier, and the number
2619  // of threads may have changed. It's possible (likely?) that a new
2620  // parallel region will exhibit the same behavior as the previous region.
2621  thread_data->td.td_deque_last_stolen = -1;
2622  }
2623  }
2624 
2625  KMP_MB();
2626  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
2627  }
2628 
2629  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2630  return is_init_thread;
2631 }
2632 
2633 // __kmp_free_task_threads_data:
2634 // Deallocates a threads_data array for a task team, including any attached
2635 // tasking deques. Only occurs at library shutdown.
2636 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
2637  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
2638  if (task_team->tt.tt_threads_data != NULL) {
2639  int i;
2640  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
2641  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
2642  }
2643  __kmp_free(task_team->tt.tt_threads_data);
2644  task_team->tt.tt_threads_data = NULL;
2645  }
2646  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
2647 }
2648 
2649 // __kmp_allocate_task_team:
2650 // Allocates a task team associated with a specific team, taking it from
2651 // the global task team free list if possible. Also initializes data
2652 // structures.
2653 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
2654  kmp_team_t *team) {
2655  kmp_task_team_t *task_team = NULL;
2656  int nthreads;
2657 
2658  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
2659  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
2660 
2661  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2662  // Take a task team from the task team pool
2663  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2664  if (__kmp_free_task_teams != NULL) {
2665  task_team = __kmp_free_task_teams;
2666  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
2667  task_team->tt.tt_next = NULL;
2668  }
2669  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2670  }
2671 
2672  if (task_team == NULL) {
2673  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
2674  "task team for team %p\n",
2675  __kmp_gtid_from_thread(thread), team));
2676  // Allocate a new task team if one is not available.
2677  // Cannot use __kmp_thread_malloc() because threads not around for
2678  // kmp_reap_task_team( ).
2679  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
2680  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
2681  // AC: __kmp_allocate zeroes returned memory
2682  // task_team -> tt.tt_threads_data = NULL;
2683  // task_team -> tt.tt_max_threads = 0;
2684  // task_team -> tt.tt_next = NULL;
2685  }
2686 
2687  TCW_4(task_team->tt.tt_found_tasks, FALSE);
2688 #if OMP_45_ENABLED
2689  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2690 #endif
2691  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
2692 
2693  TCW_4(task_team->tt.tt_unfinished_threads, nthreads);
2694  TCW_4(task_team->tt.tt_active, TRUE);
2695 
2696  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
2697  "unfinished_threads init'd to %d\n",
2698  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
2699  task_team->tt.tt_unfinished_threads));
2700  return task_team;
2701 }
2702 
2703 // __kmp_free_task_team:
2704 // Frees the task team associated with a specific thread, and adds it
2705 // to the global task team free list.
2706 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
2707  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
2708  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
2709 
2710  // Put task team back on free list
2711  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2712 
2713  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
2714  task_team->tt.tt_next = __kmp_free_task_teams;
2715  TCW_PTR(__kmp_free_task_teams, task_team);
2716 
2717  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2718 }
2719 
2720 // __kmp_reap_task_teams:
2721 // Free all the task teams on the task team free list.
2722 // Should only be done during library shutdown.
2723 // Cannot do anything that needs a thread structure or gtid since they are
2724 // already gone.
2725 void __kmp_reap_task_teams(void) {
2726  kmp_task_team_t *task_team;
2727 
2728  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2729  // Free all task_teams on the free list
2730  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
2731  while ((task_team = __kmp_free_task_teams) != NULL) {
2732  __kmp_free_task_teams = task_team->tt.tt_next;
2733  task_team->tt.tt_next = NULL;
2734 
2735  // Free threads_data if necessary
2736  if (task_team->tt.tt_threads_data != NULL) {
2737  __kmp_free_task_threads_data(task_team);
2738  }
2739  __kmp_free(task_team);
2740  }
2741  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
2742  }
2743 }
2744 
2745 // __kmp_wait_to_unref_task_teams:
2746 // Some threads could still be in the fork barrier release code, possibly
2747 // trying to steal tasks. Wait for each thread to unreference its task team.
2748 void __kmp_wait_to_unref_task_teams(void) {
2749  kmp_info_t *thread;
2750  kmp_uint32 spins;
2751  int done;
2752 
2753  KMP_INIT_YIELD(spins);
2754 
2755  for (;;) {
2756  done = TRUE;
2757 
2758  // TODO: GEH - this may be wrong because some sync would be necessary
2759  // in case threads are added to the pool during the traversal. Need to
2760  // verify that lock for thread pool is held when calling this routine.
2761  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
2762  thread = thread->th.th_next_pool) {
2763 #if KMP_OS_WINDOWS
2764  DWORD exit_val;
2765 #endif
2766  if (TCR_PTR(thread->th.th_task_team) == NULL) {
2767  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
2768  __kmp_gtid_from_thread(thread)));
2769  continue;
2770  }
2771 #if KMP_OS_WINDOWS
2772  // TODO: GEH - add this check for Linux* OS / OS X* as well?
2773  if (!__kmp_is_thread_alive(thread, &exit_val)) {
2774  thread->th.th_task_team = NULL;
2775  continue;
2776  }
2777 #endif
2778 
2779  done = FALSE; // Because th_task_team pointer is not NULL for this thread
2780 
2781  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
2782  "unreference task_team\n",
2783  __kmp_gtid_from_thread(thread)));
2784 
2785  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2786  volatile void *sleep_loc;
2787  // If the thread is sleeping, awaken it.
2788  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
2789  NULL) {
2790  KA_TRACE(
2791  10,
2792  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
2793  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
2794  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2795  }
2796  }
2797  }
2798  if (done) {
2799  break;
2800  }
2801 
2802  // If we are oversubscribed, or have waited a bit (and library mode is
2803  // throughput), yield. Pause is in the following code.
2804  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2805  KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
2806  }
2807 }
2808 
2809 // __kmp_task_team_setup: Create a task_team for the current team, but use
2810 // an already created, unused one if it already exists.
2811 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
2812  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2813 
2814  // If this task_team hasn't been created yet, allocate it. It will be used in
2815  // the region after the next.
2816  // If it exists, it is the current task team and shouldn't be touched yet as
2817  // it may still be in use.
2818  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
2819  (always || team->t.t_nproc > 1)) {
2820  team->t.t_task_team[this_thr->th.th_task_state] =
2821  __kmp_allocate_task_team(this_thr, team);
2822  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
2823  "for team %d at parity=%d\n",
2824  __kmp_gtid_from_thread(this_thr),
2825  team->t.t_task_team[this_thr->th.th_task_state],
2826  ((team != NULL) ? team->t.t_id : -1),
2827  this_thr->th.th_task_state));
2828  }
2829 
2830  // After threads exit the release, they will call sync, and then point to this
2831  // other task_team; make sure it is allocated and properly initialized. As
2832  // threads spin in the barrier release phase, they will continue to use the
2833  // previous task_team struct(above), until they receive the signal to stop
2834  // checking for tasks (they can't safely reference the kmp_team_t struct,
2835  // which could be reallocated by the master thread). No task teams are formed
2836  // for serialized teams.
2837  if (team->t.t_nproc > 1) {
2838  int other_team = 1 - this_thr->th.th_task_state;
2839  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
2840  team->t.t_task_team[other_team] =
2841  __kmp_allocate_task_team(this_thr, team);
2842  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
2843  "task_team %p for team %d at parity=%d\n",
2844  __kmp_gtid_from_thread(this_thr),
2845  team->t.t_task_team[other_team],
2846  ((team != NULL) ? team->t.t_id : -1), other_team));
2847  } else { // Leave the old task team struct in place for the upcoming region;
2848  // adjust as needed
2849  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
2850  if (!task_team->tt.tt_active ||
2851  team->t.t_nproc != task_team->tt.tt_nproc) {
2852  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
2853  TCW_4(task_team->tt.tt_found_tasks, FALSE);
2854 #if OMP_45_ENABLED
2855  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2856 #endif
2857  TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc);
2858  TCW_4(task_team->tt.tt_active, TRUE);
2859  }
2860  // if team size has changed, the first thread to enable tasking will
2861  // realloc threads_data if necessary
2862  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
2863  "%p for team %d at parity=%d\n",
2864  __kmp_gtid_from_thread(this_thr),
2865  team->t.t_task_team[other_team],
2866  ((team != NULL) ? team->t.t_id : -1), other_team));
2867  }
2868  }
2869 }
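// Illustrative summary (annotation, not part of the original source): the team
// keeps two task_team slots, indexed by th_task_state (0 and 1), which are
// used in alternation. While workers may still be draining tasks from the slot
// selected by the current th_task_state, the setup above makes sure the other
// slot is allocated and (re)activated; __kmp_task_team_sync (below) later
// flips th_task_state so threads pick up that other slot, and
// __kmp_task_team_wait (below) lets the master drain and deactivate the slot
// being retired. Swapping parity at every barrier is what allows one slot to
// be safely recycled while the other is still in use.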
2870 
2871 // __kmp_task_team_sync: Propagation of task team data from team to threads
2872 // which happens just after the release phase of a team barrier. This may be
2873 // called by any thread, but only for teams with # threads > 1.
2874 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
2875  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2876 
2877  // Toggle the th_task_state field, to switch which task_team this thread
2878  // refers to
2879  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2880  // It is now safe to propagate the task team pointer from the team struct to
2881  // the current thread.
2882  TCW_PTR(this_thr->th.th_task_team,
2883  team->t.t_task_team[this_thr->th.th_task_state]);
2884  KA_TRACE(20,
2885  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
2886  "%p from Team #%d (parity=%d)\n",
2887  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
2888  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2889 }
2890 
2891 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
2892 // barrier gather phase. Only called by master thread if #threads in team > 1 or
2893 // if proxy tasks were created.
2894 //
2895 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
2896 // by optionally passing 0 as the last argument. When wait is zero, the master
2897 // thread does not wait for unfinished_threads to reach 0.
2898 void __kmp_task_team_wait(
2899  kmp_info_t *this_thr,
2900  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
2901  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
2902 
2903  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2904  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
2905 
2906  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
2907  if (wait) {
2908  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
2909  "(for unfinished_threads to reach 0) on task_team = %p\n",
2910  __kmp_gtid_from_thread(this_thr), task_team));
2911  // Worker threads may have dropped through to release phase, but could
2912  // still be executing tasks. Wait here for tasks to complete. To avoid
2913  // memory contention, only master thread checks termination condition.
2914  kmp_flag_32 flag(
2915  RCAST(volatile kmp_uint32 *, &task_team->tt.tt_unfinished_threads),
2916  0U);
2917  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2918  }
2919  // Deactivate the old task team, so that the worker threads will stop
2920  // referencing it while spinning.
2921  KA_TRACE(
2922  20,
2923  ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
2924  "setting active to false, setting local and team's pointer to NULL\n",
2925  __kmp_gtid_from_thread(this_thr), task_team));
2926 #if OMP_45_ENABLED
2927  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
2928  task_team->tt.tt_found_proxy_tasks == TRUE);
2929  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2930 #else
2931  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
2932 #endif
2933  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
2934  KMP_MB();
2935 
2936  TCW_PTR(this_thr->th.th_task_team, NULL);
2937  }
2938 }
2939 
2940 // __kmp_tasking_barrier:
2941 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
2942 // Internal function to execute all tasks prior to a regular barrier or a join
2943 // barrier. It is a full barrier itself, which unfortunately turns regular
2944 // barriers into double barriers and join barriers into 1 1/2 barriers.
2945 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
2946  volatile kmp_uint32 *spin = RCAST(
2947  volatile kmp_uint32 *,
2948  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
2949  int flag = FALSE;
2950  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
2951 
2952 #if USE_ITT_BUILD
2953  KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL);
2954 #endif /* USE_ITT_BUILD */
2955  kmp_flag_32 spin_flag(spin, 0U);
2956  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
2957  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
2958 #if USE_ITT_BUILD
2959  // TODO: What about itt_sync_obj??
2960  KMP_FSYNC_SPIN_PREPARE(CCAST(kmp_uint32 *, spin));
2961 #endif /* USE_ITT_BUILD */
2962 
2963  if (TCR_4(__kmp_global.g.g_done)) {
2964  if (__kmp_global.g.g_abort)
2965  __kmp_abort_thread();
2966  break;
2967  }
2968  KMP_YIELD(TRUE); // GH: We always yield here
2969  }
2970 #if USE_ITT_BUILD
2971  KMP_FSYNC_SPIN_ACQUIRED(CCAST(kmp_uint32 *, spin));
2972 #endif /* USE_ITT_BUILD */
2973 }
2974 
2975 #if OMP_45_ENABLED
2976 
2977 // __kmp_give_task puts a task into a given thread's queue if:
2978 // - the queue for that thread was created
2979 // - there's space in that queue
2980 // Because of this, __kmp_push_task needs to check if there's space after
2981 // getting the lock
2982 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
2983  kmp_int32 pass) {
2984  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
2985  kmp_task_team_t *task_team = taskdata->td_task_team;
2986 
2987  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
2988  taskdata, tid));
2989 
2990  // If task_team is NULL, something has gone badly wrong...
2991  KMP_DEBUG_ASSERT(task_team != NULL);
2992 
2993  bool result = false;
2994  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
2995 
2996  if (thread_data->td.td_deque == NULL) {
2997  // There's no queue in this thread, go find another one
2998  // We're guaranteed that at least one thread has a queue
2999  KA_TRACE(30,
3000  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3001  tid, taskdata));
3002  return result;
3003  }
3004 
3005  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3006  TASK_DEQUE_SIZE(thread_data->td)) {
3007  KA_TRACE(
3008  30,
3009  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3010  taskdata, tid));
3011 
3012  // if this deque is bigger than the pass ratio give a chance to another
3013  // thread
3014  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3015  return result;
3016 
3017  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3018  __kmp_realloc_task_deque(thread, thread_data);
3019 
3020  } else {
3021 
3022  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3023 
3024  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3025  TASK_DEQUE_SIZE(thread_data->td)) {
3026  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3027  "thread %d.\n",
3028  taskdata, tid));
3029 
3030  // if this deque is bigger than the pass ratio give a chance to another
3031  // thread
3032  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3033  goto release_and_exit;
3034 
3035  __kmp_realloc_task_deque(thread, thread_data);
3036  }
3037  }
3038 
3039  // lock is held here, and there is space in the deque
3040 
3041  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3042  // Wrap index.
3043  thread_data->td.td_deque_tail =
3044  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3045  TCW_4(thread_data->td.td_deque_ntasks,
3046  TCR_4(thread_data->td.td_deque_ntasks) + 1);
3047 
3048  result = true;
3049  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3050  taskdata, tid));
3051 
3052 release_and_exit:
3053  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3054 
3055  return result;
3056 }
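// Example of the pass ratio (annotation, not part of the original source):
// __kmp_give_task refuses to grow a full deque when
//   TASK_DEQUE_SIZE / INITIAL_TASK_DEQUE_SIZE >= pass.
// With pass == 1 (the first sweep in __kmpc_proxy_task_completed_ooo below) a
// full deque still at its initial capacity has ratio 1 >= 1, so the task is
// refused and the caller tries the next thread. After a full sweep the caller
// doubles pass, so on the second sweep that same deque (ratio 1 < 2) is
// reallocated and accepts the task. Deque growth is therefore bounded by the
// number of full sweeps the caller has already made.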
3057 
3058 /* The finish of the proxy tasks is divided in two pieces:
3059  - the top half is the one that can be done from a thread outside the team
3060  - the bottom half must be run from a thread within the team
3061 
3062  In order to run the bottom half the task gets queued back into one of the
3063  threads of the team. Once the td_incomplete_child_tasks counter of the parent
3064  is decremented, the threads can leave the barriers. So, the bottom half needs
3065  to be queued before the counter is decremented. The top half is therefore
3066  divided in two parts:
3067  - things that can be run before queuing the bottom half
3068  - things that must be run after queuing the bottom half
3069 
3070  This creates a second race as the bottom half can free the task before the
3071  second top half is executed. To avoid this, we use the
3072  td_incomplete_child_tasks counter of the proxy task to synchronize the top and bottom
3073  half. */
3074 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3075  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3076  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3077  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3078  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3079 
3080  taskdata->td_flags.complete = 1; // mark the task as completed
3081 
3082  if (taskdata->td_taskgroup)
3083  KMP_TEST_THEN_DEC32(&taskdata->td_taskgroup->count);
3084 
3085  // Create an imaginary child for this task so the bottom half cannot
3086  // release the task before we have completed the second top half
3087  TCI_4(taskdata->td_incomplete_child_tasks);
3088 }
3089 
3090 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3091  kmp_int32 children = 0;
3092 
3093  // Predecrement simulated by "- 1" calculation
3094  children =
3095  KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3096  KMP_DEBUG_ASSERT(children >= 0);
3097 
3098  // Remove the imaginary child
3099  TCD_4(taskdata->td_incomplete_child_tasks);
3100 }
3101 
3102 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3103  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3104  kmp_info_t *thread = __kmp_threads[gtid];
3105 
3106  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3107  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3108  1); // top half must run before bottom half
3109 
3110  // We need to wait to make sure the top half is finished
3111  // Spinning here should be ok as this should happen quickly
3112  while (TCR_4(taskdata->td_incomplete_child_tasks) > 0)
3113  ;
3114 
3115  __kmp_release_deps(gtid, taskdata);
3116  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3117 }
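// Ordering sketch (annotation, not in the original source) tying the three
// pieces above together for a proxy task completed from outside the team:
//   1. first top half: mark the proxy complete and raise its
//      td_incomplete_child_tasks so the bottom half cannot free it yet;
//   2. the bottom half is handed back to a team thread (__kmp_give_task);
//   3. second top half: decrement the parent's td_incomplete_child_tasks and
//      drop the extra count raised in step 1;
//   4. bottom half (on a team thread): spin until the count from step 1 is
//      gone, then release dependences and free the task and its ancestors.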
3118 
3127 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3128  KMP_DEBUG_ASSERT(ptask != NULL);
3129  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3130  KA_TRACE(
3131  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3132  gtid, taskdata));
3133 
3134  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3135 
3136  __kmp_first_top_half_finish_proxy(taskdata);
3137  __kmp_second_top_half_finish_proxy(taskdata);
3138  __kmp_bottom_half_finish_proxy(gtid, ptask);
3139 
3140  KA_TRACE(10,
3141  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3142  gtid, taskdata));
3143 }
3144 
3152 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3153  KMP_DEBUG_ASSERT(ptask != NULL);
3154  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3155 
3156  KA_TRACE(
3157  10,
3158  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3159  taskdata));
3160 
3161  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3162 
3163  __kmp_first_top_half_finish_proxy(taskdata);
3164 
3165  // Enqueue task to complete bottom half completion from a thread within the
3166  // corresponding team
3167  kmp_team_t *team = taskdata->td_team;
3168  kmp_int32 nthreads = team->t.t_nproc;
3169  kmp_info_t *thread;
3170 
3171  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3172  // but we cannot use __kmp_get_random here
3173  kmp_int32 start_k = 0;
3174  kmp_int32 pass = 1;
3175  kmp_int32 k = start_k;
3176 
3177  do {
3178  // For now we're just linearly trying to find a thread
3179  thread = team->t.t_threads[k];
3180  k = (k + 1) % nthreads;
3181 
3182  // we did a full pass through all the threads
3183  if (k == start_k)
3184  pass = pass << 1;
3185 
3186  } while (!__kmp_give_task(thread, k, ptask, pass));
3187 
3188  __kmp_second_top_half_finish_proxy(taskdata);
3189 
3190  KA_TRACE(
3191  10,
3192  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3193  taskdata));
3194 }
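// Illustrative usage (annotation; the caller described here is an assumption,
// not part of this file): an asynchronous offload engine that was handed a
// proxy kmp_task_t* can signal completion in two ways -- from an OpenMP thread
// it calls __kmpc_proxy_task_completed(gtid, ptask) above, while a completion
// callback running on a thread the runtime does not manage calls
// __kmpc_proxy_task_completed_ooo(ptask), which performs both top halves here
// and queues the bottom half back to one of the team's threads.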
3195 
3196 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3197 // for taskloop
3198 //
3199 // thread: allocating thread
3200 // task_src: pointer to source task to be duplicated
3201 // returns: a pointer to the allocated kmp_task_t structure (task).
3202 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3203  kmp_task_t *task;
3204  kmp_taskdata_t *taskdata;
3205  kmp_taskdata_t *taskdata_src;
3206  kmp_taskdata_t *parent_task = thread->th.th_current_task;
3207  size_t shareds_offset;
3208  size_t task_size;
3209 
3210  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3211  task_src));
3212  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3213  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3214  TASK_FULL); // it should not be a proxy task
3215  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3216  task_size = taskdata_src->td_size_alloc;
3217 
3218  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3219  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3220  task_size));
3221 #if USE_FAST_MEMORY
3222  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3223 #else
3224  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3225 #endif /* USE_FAST_MEMORY */
3226  KMP_MEMCPY(taskdata, taskdata_src, task_size);
3227 
3228  task = KMP_TASKDATA_TO_TASK(taskdata);
3229 
3230  // Initialize new task (only specific fields not affected by memcpy)
3231  taskdata->td_task_id = KMP_GEN_TASK_ID();
3232  if (task->shareds != NULL) { // need to set up the shareds pointer
3233  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3234  task->shareds = &((char *)taskdata)[shareds_offset];
3235  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3236  0);
3237  }
3238  taskdata->td_alloc_thread = thread;
3239  taskdata->td_parent = parent_task;
3240  taskdata->td_taskgroup =
3241  parent_task
3242  ->td_taskgroup; // task inherits the taskgroup from the parent task
3243 
3244  // Only need to keep track of child task counts if team parallel and tasking
3245  // not serialized
3246  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3247  KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks);
3248  if (parent_task->td_taskgroup)
3249  KMP_TEST_THEN_INC32(&parent_task->td_taskgroup->count);
3250  // Only need to keep track of allocated child tasks for explicit tasks since
3251  // implicit tasks are not deallocated
3252  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3253  KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks);
3254  }
3255 
3256  KA_TRACE(20,
3257  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
3258  thread, taskdata, taskdata->td_parent));
3259 #if OMPT_SUPPORT
3260  __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid,
3261  (void *)task->routine);
3262 #endif
3263  return task;
3264 }
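// Numeric illustration (annotation, not in the original source): if the source
// task's shareds pointer points 128 bytes past the start of the source
// taskdata block, then shareds_offset == 128 and the copy's shareds pointer is
// rebased to 128 bytes past the start of the newly allocated block, so the
// memcpy'd pointer no longer refers into the source task's storage.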
3265 
3266 // Routine optionally generated by the compiler for setting the lastprivate flag
3267 // and calling needed constructors for private/firstprivate objects
3268 // (used to form taskloop tasks from pattern task)
3269 // Parameters: dest task, src task, lastprivate flag.
3270 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
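// A minimal sketch (annotation; not emitted by any real compiler, and the
// privates layout and all names below are assumptions for illustration only)
// of what such a routine could look like for a loop with one firstprivate
// object and a lastprivate flag kept in a privates block after the task:
//
//   struct my_privates { MyClass fp; kmp_int32 last; };      // hypothetical
//   void my_task_dup(kmp_task_t *dst, kmp_task_t *src, kmp_int32 lastpriv) {
//     my_privates *dp = (my_privates *)(dst + 1); // assumed privates layout
//     my_privates *sp = (my_privates *)(src + 1);
//     new (&dp->fp) MyClass(sp->fp); // copy-construct the firstprivate
//     dp->last = lastpriv;           // record whether this is the last chunk
//   }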
3271 
3272 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
3273 //
3274 // loc Source location information
3275 // gtid Global thread ID
3276 // task Pattern task, exposes the loop iteration range
3277 // lb Pointer to loop lower bound in task structure
3278 // ub Pointer to loop upper bound in task structure
3279 // st Loop stride
3280 // ub_glob Global upper bound (used for lastprivate check)
3281 // num_tasks Number of tasks to execute
3282 // grainsize Number of loop iterations per task
3283 // extras Number of chunks with grainsize+1 iterations
3284 // tc Iterations count
3285 // task_dup Tasks duplication routine
3286 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3287  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3288  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
3289  kmp_uint64 grainsize, kmp_uint64 extras,
3290  kmp_uint64 tc, void *task_dup) {
3291  KMP_COUNT_BLOCK(OMP_TASKLOOP);
3292  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3293  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3294  kmp_uint64 lower = *lb; // compiler provides global bounds here
3295  kmp_uint64 upper = *ub;
3296  kmp_uint64 i;
3297  kmp_info_t *thread = __kmp_threads[gtid];
3298  kmp_taskdata_t *current_task = thread->th.th_current_task;
3299  kmp_task_t *next_task;
3300  kmp_int32 lastpriv = 0;
3301  size_t lower_offset =
3302  (char *)lb - (char *)task; // remember offset of lb in the task structure
3303  size_t upper_offset =
3304  (char *)ub - (char *)task; // remember offset of ub in the task structure
3305 
3306  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3307  KMP_DEBUG_ASSERT(num_tasks > extras);
3308  KMP_DEBUG_ASSERT(num_tasks > 0);
3309  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
3310  "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", gtid, num_tasks,
3311  grainsize, extras, lower, upper, ub_glob, st, task_dup));
3312 
3313  // Launch num_tasks tasks, assign grainsize iterations each task
3314  for (i = 0; i < num_tasks; ++i) {
3315  kmp_uint64 chunk_minus_1;
3316  if (extras == 0) {
3317  chunk_minus_1 = grainsize - 1;
3318  } else {
3319  chunk_minus_1 = grainsize;
3320  --extras; // first extras iterations get bigger chunk (grainsize+1)
3321  }
3322  upper = lower + st * chunk_minus_1;
3323  if (i == num_tasks - 1) {
3324  // schedule the last task, set lastprivate flag if needed
3325  if (st == 1) { // most common case
3326  KMP_DEBUG_ASSERT(upper == *ub);
3327  if (upper == ub_glob)
3328  lastpriv = 1;
3329  } else if (st > 0) { // positive loop stride
3330  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
3331  if ((kmp_uint64)st > ub_glob - upper)
3332  lastpriv = 1;
3333  } else { // negative loop stride
3334  KMP_DEBUG_ASSERT(upper + st < *ub);
3335  if (upper - ub_glob < (kmp_uint64)(-st))
3336  lastpriv = 1;
3337  }
3338  }
3339  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3340  // adjust task-specific bounds
3341  *(kmp_uint64 *)((char *)next_task + lower_offset) = lower;
3342  *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
3343  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
3344  ptask_dup(next_task, task, lastpriv);
3345  KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, "
3346  "upper %lld (offsets %p %p)\n",
3347  gtid, next_task, lower, upper, lower_offset, upper_offset));
3348  __kmp_omp_task(gtid, next_task, true); // schedule new task
3349  lower = upper + st; // adjust lower bound for the next iteration
3350  }
3351  // free the pattern task and exit
3352  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
3353  // do not execute the pattern task, just do internal bookkeeping
3354  __kmp_task_finish(gtid, task, current_task);
3355 }
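// Worked example (annotation, not in the original source): tc = 10 iterations
// split into num_tasks = 3 gives grainsize = 10/3 = 3 and extras = 10%3 = 1
// (and indeed 3*3 + 1 == 10). In the loop above the first task takes
// chunk_minus_1 = grainsize = 3, i.e. 4 iterations, after which extras drops
// to 0; the remaining two tasks take chunk_minus_1 = 2, i.e. 3 iterations
// each, for a total of 4 + 3 + 3 = 10.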
3356 
3357 // Structure to keep taskloop parameters for auxiliary task
3358 // kept in the shareds of the task structure.
3359 typedef struct __taskloop_params {
3360  kmp_task_t *task;
3361  kmp_uint64 *lb;
3362  kmp_uint64 *ub;
3363  void *task_dup;
3364  kmp_int64 st;
3365  kmp_uint64 ub_glob;
3366  kmp_uint64 num_tasks;
3367  kmp_uint64 grainsize;
3368  kmp_uint64 extras;
3369  kmp_uint64 tc;
3370  kmp_uint64 num_t_min;
3371 } __taskloop_params_t;
3372 
3373 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
3374  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
3375  kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
3376  void *);
3377 
3378 // Execute part of the taskloop submitted as a task.
3379 int __kmp_taskloop_task(int gtid, void *ptask) {
3380  __taskloop_params_t *p = (__taskloop_params_t*)((kmp_task_t*)ptask)->shareds;
3381  kmp_task_t *task = p->task;
3382  kmp_uint64 *lb = p->lb;
3383  kmp_uint64 *ub = p->ub;
3384  void *task_dup = p->task_dup;
3385 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3386  kmp_int64 st = p->st;
3387  kmp_uint64 ub_glob = p->ub_glob;
3388  kmp_uint64 num_tasks = p->num_tasks;
3389  kmp_uint64 grainsize = p->grainsize;
3390  kmp_uint64 extras = p->extras;
3391  kmp_uint64 tc = p->tc;
3392  kmp_uint64 num_t_min = p->num_t_min;
3393 #if KMP_DEBUG
3394  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3395  KMP_DEBUG_ASSERT(task != NULL);
3396  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
3397  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
3398  num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
3399 #endif
3400  KMP_DEBUG_ASSERT(num_tasks*2+1 > num_t_min);
3401  if (num_tasks > num_t_min)
3402  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3403  grainsize, extras, tc, num_t_min, task_dup);
3404  else
3405  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
3406  grainsize, extras, tc, task_dup);
3407 
3408  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
3409  return 0;
3410 }
3411 
3412 // Schedule part of the taskloop as a task,
3413 // execute the rest of the taskloop.
3414 //
3415 // loc Source location information
3416 // gtid Global thread ID
3417 // task Pattern task, exposes the loop iteration range
3418 // lb Pointer to loop lower bound in task structure
3419 // ub Pointer to loop upper bound in task structure
3420 // st Loop stride
3421 // ub_glob Global upper bound (used for lastprivate check)
3422 // num_tasks Number of tasks to execute
3423 // grainsize Number of loop iterations per task
3424 // extras Number of chunks with grainsize+1 iterations
3425 // tc Iterations count
3426 // num_t_min Threshold for launching tasks recursively
3427 // task_dup Tasks duplication routine
3428 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
3429  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3430  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
3431  kmp_uint64 grainsize, kmp_uint64 extras,
3432  kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) {
3433 #if KMP_DEBUG
3434  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3435  KMP_DEBUG_ASSERT(task != NULL);
3436  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
3437  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
3438  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
3439  num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
3440 #endif
3441  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3442  kmp_uint64 lower = *lb;
3443  kmp_uint64 upper = *ub;
3444  kmp_info_t *thread = __kmp_threads[gtid];
3445 // kmp_taskdata_t *current_task = thread->th.th_current_task;
3446  kmp_task_t *next_task;
3447  kmp_int32 lastpriv = 0;
3448  size_t lower_offset =
3449  (char *)lb - (char *)task; // remember offset of lb in the task structure
3450  size_t upper_offset =
3451  (char *)ub - (char *)task; // remember offset of ub in the task structure
3452 
3453  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3454  KMP_DEBUG_ASSERT(num_tasks > extras);
3455  KMP_DEBUG_ASSERT(num_tasks > 0);
3456 
3457  // split the loop in two halves
3458  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
3459  kmp_uint64 gr_size0 = grainsize;
3460  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
3461  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
3462  if (n_tsk0 <= extras) {
3463  gr_size0++; // integrate extras into grainsize
3464  ext0 = 0; // no extra iters in 1st half
3465  ext1 = extras - n_tsk0; // remaining extras
3466  tc0 = gr_size0 * n_tsk0;
3467  tc1 = tc - tc0;
3468  } else { // n_tsk0 > extras
3469  ext1 = 0; // no extra iters in 2nd half
3470  ext0 = extras;
3471  tc1 = grainsize * n_tsk1;
3472  tc0 = tc - tc1;
3473  }
3474  ub0 = lower + st * (tc0 - 1);
3475  lb1 = ub0 + st;
3476 
3477  // create pattern task for 2nd half of the loop
3478  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
3479  // adjust lower bound (upper bound is not changed) for the 2nd half
3480  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
3481  if (ptask_dup != NULL) // construct firstprivates, etc.
3482  ptask_dup(next_task, task, 0);
3483  *ub = ub0; // adjust upper bound for the 1st half
3484 
3485  // create auxiliary task for 2nd half of the loop
3486  kmp_task_t *new_task =
3487  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void*),
3488  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
3489  __taskloop_params_t * p = (__taskloop_params_t *)new_task->shareds;
3490  p->task = next_task;
3491  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
3492  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
3493  p->task_dup = task_dup;
3494  p->st = st;
3495  p->ub_glob = ub_glob;
3496  p->num_tasks = n_tsk1;
3497  p->grainsize = grainsize;
3498  p->extras = ext1;
3499  p->tc = tc1;
3500  p->num_t_min = num_t_min;
3501  __kmp_omp_task(gtid, new_task, true); // schedule new task
3502 
3503  // execute the 1st half of current subrange
3504  if (n_tsk0 > num_t_min)
3505  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
3506  gr_size0, ext0, tc0, num_t_min, task_dup);
3507  else
3508  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
3509  gr_size0, ext0, tc0, task_dup);
3510 
3511  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
3512 }
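// Worked example of the split (annotation, not in the original source): with
// num_tasks = 9, grainsize = 2, extras = 3, tc = 21 (9*2 + 3), the halves are
// n_tsk0 = 4 and n_tsk1 = 5. Since n_tsk0 (4) > extras (3), the second half
// gets no extras: ext1 = 0, ext0 = 3, tc1 = 2*5 = 10, tc0 = 21 - 10 = 11, and
// both halves again satisfy tc == num_tasks*grainsize + extras
// (4*2 + 3 == 11 and 5*2 + 0 == 10). In the other branch (n_tsk0 <= extras)
// the first half instead folds one extra iteration into its grainsize
// (gr_size0).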
3513 
3530 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
3531  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
3532  int sched, kmp_uint64 grainsize, void *task_dup) {
3533  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3534  KMP_DEBUG_ASSERT(task != NULL);
3535 
3536  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
3537  "grain %llu(%d), dup %p\n", gtid, taskdata, *lb, *ub, st,
3538  grainsize, sched, task_dup));
3539 
3540  if (nogroup == 0)
3541  __kmpc_taskgroup(loc, gtid);
3542 
3543  // =========================================================================
3544  // calculate loop parameters
3545  kmp_uint64 tc;
3546  kmp_uint64 lower = *lb; // compiler provides global bounds here
3547  kmp_uint64 upper = *ub;
3548  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
3549  kmp_uint64 num_tasks = 0, extras = 0;
3550  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
3551  kmp_info_t *thread = __kmp_threads[gtid];
3552  kmp_taskdata_t *current_task = thread->th.th_current_task;
3553 
3554  // compute trip count
3555  if (st == 1) { // most common case
3556  tc = upper - lower + 1;
3557  } else if (st < 0) {
3558  tc = (lower - upper) / (-st) + 1;
3559  } else { // st > 0
3560  tc = (upper - lower) / st + 1;
3561  }
3562  if (tc == 0) {
3563  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
3564  // free the pattern task and exit
3565  __kmp_task_start(gtid, task, current_task);
3566  // do not execute anything for zero-trip loop
3567  __kmp_task_finish(gtid, task, current_task);
3568  return;
3569  }
3570  if (num_tasks_min == 0)
3571  // TODO: can we choose better default heuristic?
3572  num_tasks_min = KMP_MIN(thread->th.th_team_nproc * 10,
3573  INITIAL_TASK_DEQUE_SIZE);
3574 
3575  // compute num_tasks/grainsize based on the input provided
3576  switch (sched) {
3577  case 0: // no schedule clause specified, we can choose the default
3578  // let's try to schedule (team_size * 10) tasks, then fall through to the num_tasks case
3579  grainsize = thread->th.th_team_nproc * 10;
3580  case 2: // num_tasks provided
3581  if (grainsize > tc) {
3582  num_tasks = tc; // too big num_tasks requested, adjust values
3583  grainsize = 1;
3584  extras = 0;
3585  } else {
3586  num_tasks = grainsize;
3587  grainsize = tc / num_tasks;
3588  extras = tc % num_tasks;
3589  }
3590  break;
3591  case 1: // grainsize provided
3592  if (grainsize > tc) {
3593  num_tasks = 1; // too big grainsize requested, adjust values
3594  grainsize = tc;
3595  extras = 0;
3596  } else {
3597  num_tasks = tc / grainsize;
3598  // adjust grainsize for balanced distribution of iterations
3599  grainsize = tc / num_tasks;
3600  extras = tc % num_tasks;
3601  }
3602  break;
3603  default:
3604  KMP_ASSERT2(0, "unknown scheduling of taskloop");
3605  }
3606  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3607  KMP_DEBUG_ASSERT(num_tasks > extras);
3608  KMP_DEBUG_ASSERT(num_tasks > 0);
3609  // =========================================================================
3610 
3611  // check the if-clause value first
3612  if (if_val == 0) { // if(0) specified, mark task as serial
3613  taskdata->td_flags.task_serial = 1;
3614  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
3615  // always start serial tasks linearly
3616  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
3617  grainsize, extras, tc, task_dup);
3618  } else if (num_tasks > num_tasks_min) {
3619  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
3620  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
3621  num_tasks_min, grainsize, extras));
3622  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
3623  grainsize, extras, tc, num_tasks_min, task_dup);
3624  } else {
3625  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
3626  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
3627  num_tasks_min, grainsize, extras));
3628  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
3629  grainsize, extras, tc, task_dup);
3630  }
3631 
3632  if (nogroup == 0)
3633  __kmpc_end_taskgroup(loc, gtid);
3634  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
3635 }
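// Clause-mapping example (annotation; the pragma-to-argument mapping follows
// the case comments above and is stated here as an assumption): a construct
// like "#pragma omp taskloop num_tasks(8)" arrives with sched == 2 and the
// clause value in the grainsize argument, so for tc = 100 the code above
// computes num_tasks = 8, grainsize = 100/8 = 12 and extras = 100%8 = 4
// (8*12 + 4 == 100); "#pragma omp taskloop grainsize(12)" arrives with
// sched == 1 and yields num_tasks = 100/12 = 8 with the same rebalanced
// grainsize and extras; with no schedule clause (sched == 0) the default aims
// for roughly team_size * 10 tasks.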
3636 
3637 #endif