LLVM OpenMP* Runtime Library
 All Classes Functions Variables Typedefs Enumerations Enumerator Groups Pages
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46  KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
50 #if OMP_50_ENABLED
51  "5.0 (201611)";
52 #elif OMP_45_ENABLED
53  "4.5 (201511)";
54 #elif OMP_40_ENABLED
55  "4.0 (201307)";
56 #else
57  "3.1 (201107)";
58 #endif
59 
60 #ifdef KMP_DEBUG
61 char const __kmp_version_lock[] =
62  KMP_VERSION_PREFIX "lock type: run time selectable";
63 #endif /* KMP_DEBUG */
64 
65 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
66 
67 /* ------------------------------------------------------------------------ */
68 
69 #if KMP_USE_MONITOR
70 kmp_info_t __kmp_monitor;
71 #endif
72 
73 /* Forward declarations */
74 
75 void __kmp_cleanup(void);
76 
77 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
78  int gtid);
79 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
80  kmp_internal_control_t *new_icvs,
81  ident_t *loc);
82 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
83 static void __kmp_partition_places(kmp_team_t *team,
84  int update_master_only = 0);
85 #endif
86 static void __kmp_do_serial_initialize(void);
87 void __kmp_fork_barrier(int gtid, int tid);
88 void __kmp_join_barrier(int gtid);
89 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
90  kmp_internal_control_t *new_icvs, ident_t *loc);
91 
92 #ifdef USE_LOAD_BALANCE
93 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
94 #endif
95 
96 static int __kmp_expand_threads(int nNeed);
97 #if KMP_OS_WINDOWS
98 static int __kmp_unregister_root_other_thread(int gtid);
99 #endif
100 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
104 /* Calculate the identifier of the current thread */
105 /* fast (and somewhat portable) way to get unique identifier of executing
106  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
107 int __kmp_get_global_thread_id() {
108  int i;
109  kmp_info_t **other_threads;
110  size_t stack_data;
111  char *stack_addr;
112  size_t stack_size;
113  char *stack_base;
114 
115  KA_TRACE(
116  1000,
117  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
118  __kmp_nth, __kmp_all_nth));
119 
120  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
121  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
122  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
123  __kmp_init_gtid for this to work. */
124 
125  if (!TCR_4(__kmp_init_gtid))
126  return KMP_GTID_DNE;
127 
128 #ifdef KMP_TDATA_GTID
129  if (TCR_4(__kmp_gtid_mode) >= 3) {
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131  return __kmp_gtid;
132  }
133 #endif
134  if (TCR_4(__kmp_gtid_mode) >= 2) {
135  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136  return __kmp_gtid_get_specific();
137  }
138  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140  stack_addr = (char *)&stack_data;
141  other_threads = __kmp_threads;
142 
143  /* ATT: The code below is a source of potential bugs due to unsynchronized
144  access to __kmp_threads array. For example:
145  1. Current thread loads other_threads[i] to thr and checks it, it is
146  non-NULL.
147  2. Current thread is suspended by OS.
148  3. Another thread unregisters and finishes (debug versions of free()
149  may fill memory with something like 0xEF).
150  4. Current thread is resumed.
151  5. Current thread reads junk from *thr.
152  TODO: Fix it. --ln */
153 
154  for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157  if (!thr)
158  continue;
159 
160  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163  /* stack grows down -- search through all of the active threads */
164 
165  if (stack_addr <= stack_base) {
166  size_t stack_diff = stack_base - stack_addr;
167 
168  if (stack_diff <= stack_size) {
169  /* The only way we can be closer than the allocated */
170  /* stack size is if we are running on this thread. */
171  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172  return i;
173  }
174  }
175  }
176 
177  /* get specific to try and determine our gtid */
178  KA_TRACE(1000,
179  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180  "thread, using TLS\n"));
181  i = __kmp_gtid_get_specific();
182 
183  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
184 
185  /* if we havn't been assigned a gtid, then return code */
186  if (i < 0)
187  return i;
188 
189  /* dynamically updated stack window for uber threads to avoid get_specific
190  call */
191  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192  KMP_FATAL(StackOverflow, i);
193  }
194 
195  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196  if (stack_addr > stack_base) {
197  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200  stack_base);
201  } else {
202  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203  stack_base - stack_addr);
204  }
205 
206  /* Reprint stack bounds for ubermaster since they have been refined */
207  if (__kmp_storage_map) {
208  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211  other_threads[i]->th.th_info.ds.ds_stacksize,
212  "th_%d stack (refinement)", i);
213  }
214  return i;
215 }
216 
217 int __kmp_get_global_thread_id_reg() {
218  int gtid;
219 
220  if (!__kmp_init_serial) {
221  gtid = KMP_GTID_DNE;
222  } else
223 #ifdef KMP_TDATA_GTID
224  if (TCR_4(__kmp_gtid_mode) >= 3) {
225  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226  gtid = __kmp_gtid;
227  } else
228 #endif
229  if (TCR_4(__kmp_gtid_mode) >= 2) {
230  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231  gtid = __kmp_gtid_get_specific();
232  } else {
233  KA_TRACE(1000,
234  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235  gtid = __kmp_get_global_thread_id();
236  }
237 
238  /* we must be a new uber master sibling thread */
239  if (gtid == KMP_GTID_DNE) {
240  KA_TRACE(10,
241  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242  "Registering a new gtid.\n"));
243  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244  if (!__kmp_init_serial) {
245  __kmp_do_serial_initialize();
246  gtid = __kmp_gtid_get_specific();
247  } else {
248  gtid = __kmp_register_root(FALSE);
249  }
250  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252  }
253 
254  KMP_DEBUG_ASSERT(gtid >= 0);
255 
256  return gtid;
257 }
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261  int f;
262  char *stack_beg = NULL;
263  char *stack_end = NULL;
264  int gtid;
265 
266  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267  if (__kmp_storage_map) {
268  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271  gtid = __kmp_gtid_from_thread(th);
272 
273  if (gtid == KMP_GTID_MONITOR) {
274  __kmp_print_storage_map_gtid(
275  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276  "th_%s stack (%s)", "mon",
277  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278  } else {
279  __kmp_print_storage_map_gtid(
280  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281  "th_%d stack (%s)", gtid,
282  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283  }
284  }
285 
286  /* No point in checking ubermaster threads since they use refinement and
287  * cannot overlap */
288  gtid = __kmp_gtid_from_thread(th);
289  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290  KA_TRACE(10,
291  ("__kmp_check_stack_overlap: performing extensive checking\n"));
292  if (stack_beg == NULL) {
293  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295  }
296 
297  for (f = 0; f < __kmp_threads_capacity; f++) {
298  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300  if (f_th && f_th != th) {
301  char *other_stack_end =
302  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303  char *other_stack_beg =
304  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308  /* Print the other stack values before the abort */
309  if (__kmp_storage_map)
310  __kmp_print_storage_map_gtid(
311  -1, other_stack_beg, other_stack_end,
312  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316  __kmp_msg_null);
317  }
318  }
319  }
320  }
321  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327  static int done = FALSE;
328 
329  while (!done) {
330  KMP_YIELD(TRUE);
331  }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337  char const *format, ...) {
338  char buffer[MAX_MESSAGE];
339  va_list ap;
340 
341  va_start(ap, format);
342  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343  p2, (unsigned long)size, format);
344  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345  __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347  int node;
348  if (gtid >= 0) {
349  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350  if (__kmp_storage_map_verbose) {
351  node = __kmp_get_host_node(p1);
352  if (node < 0) /* doesn't work, so don't try this next time */
353  __kmp_storage_map_verbose = FALSE;
354  else {
355  char *last;
356  int lastNode;
357  int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359  const int page_size = KMP_GET_PAGE_SIZE();
360 
361  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363  if (localProc >= 0)
364  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
365  localProc >> 1);
366  else
367  __kmp_printf_no_lock(" GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369  /* The more elaborate format is disabled for now because of the prctl
370  * hanging bug. */
371  do {
372  last = p1;
373  lastNode = node;
374  /* This loop collates adjacent pages with the same host node. */
375  do {
376  (char *)p1 += page_size;
377  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
379  lastNode);
380  } while (p1 <= p2);
381 #else
382  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
383  (char *)p1 + (page_size - 1),
384  __kmp_get_host_node(p1));
385  if (p1 < p2) {
386  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
387  (char *)p2 + (page_size - 1),
388  __kmp_get_host_node(p2));
389  }
390 #endif
391  }
392  }
393  } else
394  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
395  }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401  char buffer[MAX_MESSAGE];
402  va_list ap;
403 
404  if (__kmp_generate_warnings == kmp_warnings_off) {
405  return;
406  }
407 
408  va_start(ap, format);
409 
410  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412  __kmp_vprintf(kmp_err, buffer, ap);
413  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415  va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419  // Later threads may stall here, but that's ok because abort() will kill them.
420  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422  if (__kmp_debug_buf) {
423  __kmp_dump_debug_buffer();
424  }
425 
426  if (KMP_OS_WINDOWS) {
427  // Let other threads know of abnormal termination and prevent deadlock
428  // if abort happened during library initialization or shutdown
429  __kmp_global.g.g_abort = SIGABRT;
430 
431  /* On Windows* OS by default abort() causes pop-up error box, which stalls
432  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
433  boxes. _set_abort_behavior() works well, but this function is not
434  available in VS7 (this is not problem for DLL, but it is a problem for
435  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
436  help, at least in some versions of MS C RTL.
437 
438  It seems following sequence is the only way to simulate abort() and
439  avoid pop-up error box. */
440  raise(SIGABRT);
441  _exit(3); // Just in case, if signal ignored, exit anyway.
442  } else {
443  abort();
444  }
445 
446  __kmp_infinite_loop();
447  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
448 
449 } // __kmp_abort_process
450 
451 void __kmp_abort_thread(void) {
452  // TODO: Eliminate g_abort global variable and this function.
453  // In case of abort just call abort(), it will kill all the threads.
454  __kmp_infinite_loop();
455 } // __kmp_abort_thread
456 
457 /* Print out the storage map for the major kmp_info_t thread data structures
458  that are allocated together. */
459 
460 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
461  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
462  gtid);
463 
464  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
465  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
466 
467  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
468  sizeof(kmp_local_t), "th_%d.th_local", gtid);
469 
470  __kmp_print_storage_map_gtid(
471  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
472  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
473 
474  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
475  &thr->th.th_bar[bs_plain_barrier + 1],
476  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
477  gtid);
478 
479  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480  &thr->th.th_bar[bs_forkjoin_barrier + 1],
481  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
482  gtid);
483 
484 #if KMP_FAST_REDUCTION_BARRIER
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
486  &thr->th.th_bar[bs_reduction_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
488  gtid);
489 #endif // KMP_FAST_REDUCTION_BARRIER
490 }
491 
492 /* Print out the storage map for the major kmp_team_t team data structures
493  that are allocated together. */
494 
495 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
496  int team_id, int num_thr) {
497  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
498  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
499  header, team_id);
500 
501  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
502  &team->t.t_bar[bs_last_barrier],
503  sizeof(kmp_balign_team_t) * bs_last_barrier,
504  "%s_%d.t_bar", header, team_id);
505 
506  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
507  &team->t.t_bar[bs_plain_barrier + 1],
508  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
509  header, team_id);
510 
511  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
512  &team->t.t_bar[bs_forkjoin_barrier + 1],
513  sizeof(kmp_balign_team_t),
514  "%s_%d.t_bar[forkjoin]", header, team_id);
515 
516 #if KMP_FAST_REDUCTION_BARRIER
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
518  &team->t.t_bar[bs_reduction_barrier + 1],
519  sizeof(kmp_balign_team_t),
520  "%s_%d.t_bar[reduction]", header, team_id);
521 #endif // KMP_FAST_REDUCTION_BARRIER
522 
523  __kmp_print_storage_map_gtid(
524  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
525  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
526 
527  __kmp_print_storage_map_gtid(
528  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
529  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
530 
531  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
532  &team->t.t_disp_buffer[num_disp_buff],
533  sizeof(dispatch_shared_info_t) * num_disp_buff,
534  "%s_%d.t_disp_buffer", header, team_id);
535 }
536 
537 static void __kmp_init_allocator() {
538 #if OMP_50_ENABLED
539  __kmp_init_memkind();
540 #endif
541 }
542 static void __kmp_fini_allocator() {
543 #if OMP_50_ENABLED
544  __kmp_fini_memkind();
545 #endif
546 }
547 
548 /* ------------------------------------------------------------------------ */
549 
550 #if KMP_DYNAMIC_LIB
551 #if KMP_OS_WINDOWS
552 
553 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
554  // TODO: Change to __kmp_break_bootstrap_lock().
555  __kmp_init_bootstrap_lock(lck); // make the lock released
556 }
557 
558 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
559  int i;
560  int thread_count;
561 
562  // PROCESS_DETACH is expected to be called by a thread that executes
563  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
564  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
565  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
566  // threads can be still alive here, although being about to be terminated. The
567  // threads in the array with ds_thread==0 are most suspicious. Actually, it
568  // can be not safe to access the __kmp_threads[].
569 
570  // TODO: does it make sense to check __kmp_roots[] ?
571 
572  // Let's check that there are no other alive threads registered with the OMP
573  // lib.
574  while (1) {
575  thread_count = 0;
576  for (i = 0; i < __kmp_threads_capacity; ++i) {
577  if (!__kmp_threads)
578  continue;
579  kmp_info_t *th = __kmp_threads[i];
580  if (th == NULL)
581  continue;
582  int gtid = th->th.th_info.ds.ds_gtid;
583  if (gtid == gtid_req)
584  continue;
585  if (gtid < 0)
586  continue;
587  DWORD exit_val;
588  int alive = __kmp_is_thread_alive(th, &exit_val);
589  if (alive) {
590  ++thread_count;
591  }
592  }
593  if (thread_count == 0)
594  break; // success
595  }
596 
597  // Assume that I'm alone. Now it might be safe to check and reset locks.
598  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
599  __kmp_reset_lock(&__kmp_forkjoin_lock);
600 #ifdef KMP_DEBUG
601  __kmp_reset_lock(&__kmp_stdio_lock);
602 #endif // KMP_DEBUG
603 }
604 
605 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
606  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
607 
608  switch (fdwReason) {
609 
610  case DLL_PROCESS_ATTACH:
611  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
612 
613  return TRUE;
614 
615  case DLL_PROCESS_DETACH:
616  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
617 
618  if (lpReserved != NULL) {
619  // lpReserved is used for telling the difference:
620  // lpReserved == NULL when FreeLibrary() was called,
621  // lpReserved != NULL when the process terminates.
622  // When FreeLibrary() is called, worker threads remain alive. So they will
623  // release the forkjoin lock by themselves. When the process terminates,
624  // worker threads disappear triggering the problem of unreleased forkjoin
625  // lock as described below.
626 
627  // A worker thread can take the forkjoin lock. The problem comes up if
628  // that worker thread becomes dead before it releases the forkjoin lock.
629  // The forkjoin lock remains taken, while the thread executing
630  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
631  // to take the forkjoin lock and will always fail, so that the application
632  // will never finish [normally]. This scenario is possible if
633  // __kmpc_end() has not been executed. It looks like it's not a corner
634  // case, but common cases:
635  // - the main function was compiled by an alternative compiler;
636  // - the main function was compiled by icl but without /Qopenmp
637  // (application with plugins);
638  // - application terminates by calling C exit(), Fortran CALL EXIT() or
639  // Fortran STOP.
640  // - alive foreign thread prevented __kmpc_end from doing cleanup.
641  //
642  // This is a hack to work around the problem.
643  // TODO: !!! figure out something better.
644  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
645  }
646 
647  __kmp_internal_end_library(__kmp_gtid_get_specific());
648 
649  return TRUE;
650 
651  case DLL_THREAD_ATTACH:
652  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
653 
654  /* if we want to register new siblings all the time here call
655  * __kmp_get_gtid(); */
656  return TRUE;
657 
658  case DLL_THREAD_DETACH:
659  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
660 
661  __kmp_internal_end_thread(__kmp_gtid_get_specific());
662  return TRUE;
663  }
664 
665  return TRUE;
666 }
667 
668 #endif /* KMP_OS_WINDOWS */
669 #endif /* KMP_DYNAMIC_LIB */
670 
671 /* __kmp_parallel_deo -- Wait until it's our turn. */
672 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
673  int gtid = *gtid_ref;
674 #ifdef BUILD_PARALLEL_ORDERED
675  kmp_team_t *team = __kmp_team_from_gtid(gtid);
676 #endif /* BUILD_PARALLEL_ORDERED */
677 
678  if (__kmp_env_consistency_check) {
679  if (__kmp_threads[gtid]->th.th_root->r.r_active)
680 #if KMP_USE_DYNAMIC_LOCK
681  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
682 #else
683  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
684 #endif
685  }
686 #ifdef BUILD_PARALLEL_ORDERED
687  if (!team->t.t_serialized) {
688  KMP_MB();
689  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
690  NULL);
691  KMP_MB();
692  }
693 #endif /* BUILD_PARALLEL_ORDERED */
694 }
695 
696 /* __kmp_parallel_dxo -- Signal the next task. */
697 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
698  int gtid = *gtid_ref;
699 #ifdef BUILD_PARALLEL_ORDERED
700  int tid = __kmp_tid_from_gtid(gtid);
701  kmp_team_t *team = __kmp_team_from_gtid(gtid);
702 #endif /* BUILD_PARALLEL_ORDERED */
703 
704  if (__kmp_env_consistency_check) {
705  if (__kmp_threads[gtid]->th.th_root->r.r_active)
706  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
707  }
708 #ifdef BUILD_PARALLEL_ORDERED
709  if (!team->t.t_serialized) {
710  KMP_MB(); /* Flush all pending memory write invalidates. */
711 
712  /* use the tid of the next thread in this team */
713  /* TODO replace with general release procedure */
714  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
715 
716  KMP_MB(); /* Flush all pending memory write invalidates. */
717  }
718 #endif /* BUILD_PARALLEL_ORDERED */
719 }
720 
721 /* ------------------------------------------------------------------------ */
722 /* The BARRIER for a SINGLE process section is always explicit */
723 
724 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
725  int status;
726  kmp_info_t *th;
727  kmp_team_t *team;
728 
729  if (!TCR_4(__kmp_init_parallel))
730  __kmp_parallel_initialize();
731 
732 #if OMP_50_ENABLED
733  __kmp_resume_if_soft_paused();
734 #endif
735 
736  th = __kmp_threads[gtid];
737  team = th->th.th_team;
738  status = 0;
739 
740  th->th.th_ident = id_ref;
741 
742  if (team->t.t_serialized) {
743  status = 1;
744  } else {
745  kmp_int32 old_this = th->th.th_local.this_construct;
746 
747  ++th->th.th_local.this_construct;
748  /* try to set team count to thread count--success means thread got the
749  single block */
750  /* TODO: Should this be acquire or release? */
751  if (team->t.t_construct == old_this) {
752  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
753  th->th.th_local.this_construct);
754  }
755 #if USE_ITT_BUILD
756  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
757  KMP_MASTER_GTID(gtid) &&
758 #if OMP_40_ENABLED
759  th->th.th_teams_microtask == NULL &&
760 #endif
761  team->t.t_active_level ==
762  1) { // Only report metadata by master of active team at level 1
763  __kmp_itt_metadata_single(id_ref);
764  }
765 #endif /* USE_ITT_BUILD */
766  }
767 
768  if (__kmp_env_consistency_check) {
769  if (status && push_ws) {
770  __kmp_push_workshare(gtid, ct_psingle, id_ref);
771  } else {
772  __kmp_check_workshare(gtid, ct_psingle, id_ref);
773  }
774  }
775 #if USE_ITT_BUILD
776  if (status) {
777  __kmp_itt_single_start(gtid);
778  }
779 #endif /* USE_ITT_BUILD */
780  return status;
781 }
782 
783 void __kmp_exit_single(int gtid) {
784 #if USE_ITT_BUILD
785  __kmp_itt_single_end(gtid);
786 #endif /* USE_ITT_BUILD */
787  if (__kmp_env_consistency_check)
788  __kmp_pop_workshare(gtid, ct_psingle, NULL);
789 }
790 
791 /* determine if we can go parallel or must use a serialized parallel region and
792  * how many threads we can use
793  * set_nproc is the number of threads requested for the team
794  * returns 0 if we should serialize or only use one thread,
795  * otherwise the number of threads to use
796  * The forkjoin lock is held by the caller. */
797 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
798  int master_tid, int set_nthreads
799 #if OMP_40_ENABLED
800  ,
801  int enter_teams
802 #endif /* OMP_40_ENABLED */
803  ) {
804  int capacity;
805  int new_nthreads;
806  KMP_DEBUG_ASSERT(__kmp_init_serial);
807  KMP_DEBUG_ASSERT(root && parent_team);
808  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
809 
810  // If dyn-var is set, dynamically adjust the number of desired threads,
811  // according to the method specified by dynamic_mode.
812  new_nthreads = set_nthreads;
813  if (!get__dynamic_2(parent_team, master_tid)) {
814  ;
815  }
816 #ifdef USE_LOAD_BALANCE
817  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
818  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
819  if (new_nthreads == 1) {
820  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
821  "reservation to 1 thread\n",
822  master_tid));
823  return 1;
824  }
825  if (new_nthreads < set_nthreads) {
826  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
827  "reservation to %d threads\n",
828  master_tid, new_nthreads));
829  }
830  }
831 #endif /* USE_LOAD_BALANCE */
832  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
833  new_nthreads = __kmp_avail_proc - __kmp_nth +
834  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
835  if (new_nthreads <= 1) {
836  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
837  "reservation to 1 thread\n",
838  master_tid));
839  return 1;
840  }
841  if (new_nthreads < set_nthreads) {
842  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
843  "reservation to %d threads\n",
844  master_tid, new_nthreads));
845  } else {
846  new_nthreads = set_nthreads;
847  }
848  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
849  if (set_nthreads > 2) {
850  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
851  new_nthreads = (new_nthreads % set_nthreads) + 1;
852  if (new_nthreads == 1) {
853  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
854  "reservation to 1 thread\n",
855  master_tid));
856  return 1;
857  }
858  if (new_nthreads < set_nthreads) {
859  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
860  "reservation to %d threads\n",
861  master_tid, new_nthreads));
862  }
863  }
864  } else {
865  KMP_ASSERT(0);
866  }
867 
868  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
869  if (__kmp_nth + new_nthreads -
870  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
871  __kmp_max_nth) {
872  int tl_nthreads = __kmp_max_nth - __kmp_nth +
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
874  if (tl_nthreads <= 0) {
875  tl_nthreads = 1;
876  }
877 
878  // If dyn-var is false, emit a 1-time warning.
879  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
880  __kmp_reserve_warn = 1;
881  __kmp_msg(kmp_ms_warning,
882  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
883  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
884  }
885  if (tl_nthreads == 1) {
886  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
887  "reduced reservation to 1 thread\n",
888  master_tid));
889  return 1;
890  }
891  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
892  "reservation to %d threads\n",
893  master_tid, tl_nthreads));
894  new_nthreads = tl_nthreads;
895  }
896 
897  // Respect OMP_THREAD_LIMIT
898  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
899  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
900  if (cg_nthreads + new_nthreads -
901  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
902  max_cg_threads) {
903  int tl_nthreads = max_cg_threads - cg_nthreads +
904  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
905  if (tl_nthreads <= 0) {
906  tl_nthreads = 1;
907  }
908 
909  // If dyn-var is false, emit a 1-time warning.
910  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
911  __kmp_reserve_warn = 1;
912  __kmp_msg(kmp_ms_warning,
913  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
914  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
915  }
916  if (tl_nthreads == 1) {
917  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
918  "reduced reservation to 1 thread\n",
919  master_tid));
920  return 1;
921  }
922  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
923  "reservation to %d threads\n",
924  master_tid, tl_nthreads));
925  new_nthreads = tl_nthreads;
926  }
927 
928  // Check if the threads array is large enough, or needs expanding.
929  // See comment in __kmp_register_root() about the adjustment if
930  // __kmp_threads[0] == NULL.
931  capacity = __kmp_threads_capacity;
932  if (TCR_PTR(__kmp_threads[0]) == NULL) {
933  --capacity;
934  }
935  if (__kmp_nth + new_nthreads -
936  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
937  capacity) {
938  // Expand the threads array.
939  int slotsRequired = __kmp_nth + new_nthreads -
940  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
941  capacity;
942  int slotsAdded = __kmp_expand_threads(slotsRequired);
943  if (slotsAdded < slotsRequired) {
944  // The threads array was not expanded enough.
945  new_nthreads -= (slotsRequired - slotsAdded);
946  KMP_ASSERT(new_nthreads >= 1);
947 
948  // If dyn-var is false, emit a 1-time warning.
949  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
950  __kmp_reserve_warn = 1;
951  if (__kmp_tp_cached) {
952  __kmp_msg(kmp_ms_warning,
953  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
954  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
955  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
956  } else {
957  __kmp_msg(kmp_ms_warning,
958  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
959  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
960  }
961  }
962  }
963  }
964 
965 #ifdef KMP_DEBUG
966  if (new_nthreads == 1) {
967  KC_TRACE(10,
968  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
969  "dead roots and rechecking; requested %d threads\n",
970  __kmp_get_gtid(), set_nthreads));
971  } else {
972  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
973  " %d threads\n",
974  __kmp_get_gtid(), new_nthreads, set_nthreads));
975  }
976 #endif // KMP_DEBUG
977  return new_nthreads;
978 }
979 
980 /* Allocate threads from the thread pool and assign them to the new team. We are
981  assured that there are enough threads available, because we checked on that
982  earlier within critical section forkjoin */
983 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
984  kmp_info_t *master_th, int master_gtid) {
985  int i;
986  int use_hot_team;
987 
988  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
989  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
990  KMP_MB();
991 
992  /* first, let's setup the master thread */
993  master_th->th.th_info.ds.ds_tid = 0;
994  master_th->th.th_team = team;
995  master_th->th.th_team_nproc = team->t.t_nproc;
996  master_th->th.th_team_master = master_th;
997  master_th->th.th_team_serialized = FALSE;
998  master_th->th.th_dispatch = &team->t.t_dispatch[0];
999 
1000 /* make sure we are not the optimized hot team */
1001 #if KMP_NESTED_HOT_TEAMS
1002  use_hot_team = 0;
1003  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1004  if (hot_teams) { // hot teams array is not allocated if
1005  // KMP_HOT_TEAMS_MAX_LEVEL=0
1006  int level = team->t.t_active_level - 1; // index in array of hot teams
1007  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1008  if (master_th->th.th_teams_size.nteams > 1) {
1009  ++level; // level was not increased in teams construct for
1010  // team_of_masters
1011  }
1012  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1013  master_th->th.th_teams_level == team->t.t_level) {
1014  ++level; // level was not increased in teams construct for
1015  // team_of_workers before the parallel
1016  } // team->t.t_level will be increased inside parallel
1017  }
1018  if (level < __kmp_hot_teams_max_level) {
1019  if (hot_teams[level].hot_team) {
1020  // hot team has already been allocated for given level
1021  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1022  use_hot_team = 1; // the team is ready to use
1023  } else {
1024  use_hot_team = 0; // AC: threads are not allocated yet
1025  hot_teams[level].hot_team = team; // remember new hot team
1026  hot_teams[level].hot_team_nth = team->t.t_nproc;
1027  }
1028  } else {
1029  use_hot_team = 0;
1030  }
1031  }
1032 #else
1033  use_hot_team = team == root->r.r_hot_team;
1034 #endif
1035  if (!use_hot_team) {
1036 
1037  /* install the master thread */
1038  team->t.t_threads[0] = master_th;
1039  __kmp_initialize_info(master_th, team, 0, master_gtid);
1040 
1041  /* now, install the worker threads */
1042  for (i = 1; i < team->t.t_nproc; i++) {
1043 
1044  /* fork or reallocate a new thread and install it in team */
1045  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1046  team->t.t_threads[i] = thr;
1047  KMP_DEBUG_ASSERT(thr);
1048  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1049  /* align team and thread arrived states */
1050  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1051  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1052  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1053  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1054  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1055  team->t.t_bar[bs_plain_barrier].b_arrived));
1056 #if OMP_40_ENABLED
1057  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1058  thr->th.th_teams_level = master_th->th.th_teams_level;
1059  thr->th.th_teams_size = master_th->th.th_teams_size;
1060 #endif
1061  { // Initialize threads' barrier data.
1062  int b;
1063  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1064  for (b = 0; b < bs_last_barrier; ++b) {
1065  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1066  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1067 #if USE_DEBUGGER
1068  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1069 #endif
1070  }
1071  }
1072  }
1073 
1074 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1075  __kmp_partition_places(team);
1076 #endif
1077  }
1078 
1079 #if OMP_50_ENABLED
1080  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1081  for (i = 0; i < team->t.t_nproc; i++) {
1082  kmp_info_t *thr = team->t.t_threads[i];
1083  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1084  thr->th.th_prev_level != team->t.t_level) {
1085  team->t.t_display_affinity = 1;
1086  break;
1087  }
1088  }
1089  }
1090 #endif
1091 
1092  KMP_MB();
1093 }
1094 
1095 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1096 // Propagate any changes to the floating point control registers out to the team
1097 // We try to avoid unnecessary writes to the relevant cache line in the team
1098 // structure, so we don't make changes unless they are needed.
1099 inline static void propagateFPControl(kmp_team_t *team) {
1100  if (__kmp_inherit_fp_control) {
1101  kmp_int16 x87_fpu_control_word;
1102  kmp_uint32 mxcsr;
1103 
1104  // Get master values of FPU control flags (both X87 and vector)
1105  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1106  __kmp_store_mxcsr(&mxcsr);
1107  mxcsr &= KMP_X86_MXCSR_MASK;
1108 
1109  // There is no point looking at t_fp_control_saved here.
1110  // If it is TRUE, we still have to update the values if they are different
1111  // from those we now have. If it is FALSE we didn't save anything yet, but
1112  // our objective is the same. We have to ensure that the values in the team
1113  // are the same as those we have.
1114  // So, this code achieves what we need whether or not t_fp_control_saved is
1115  // true. By checking whether the value needs updating we avoid unnecessary
1116  // writes that would put the cache-line into a written state, causing all
1117  // threads in the team to have to read it again.
1118  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1119  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1120  // Although we don't use this value, other code in the runtime wants to know
1121  // whether it should restore them. So we must ensure it is correct.
1122  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1123  } else {
1124  // Similarly here. Don't write to this cache-line in the team structure
1125  // unless we have to.
1126  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1127  }
1128 }
1129 
1130 // Do the opposite, setting the hardware registers to the updated values from
1131 // the team.
1132 inline static void updateHWFPControl(kmp_team_t *team) {
1133  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1134  // Only reset the fp control regs if they have been changed in the team.
1135  // the parallel region that we are exiting.
1136  kmp_int16 x87_fpu_control_word;
1137  kmp_uint32 mxcsr;
1138  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1139  __kmp_store_mxcsr(&mxcsr);
1140  mxcsr &= KMP_X86_MXCSR_MASK;
1141 
1142  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1143  __kmp_clear_x87_fpu_status_word();
1144  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1145  }
1146 
1147  if (team->t.t_mxcsr != mxcsr) {
1148  __kmp_load_mxcsr(&team->t.t_mxcsr);
1149  }
1150  }
1151 }
1152 #else
1153 #define propagateFPControl(x) ((void)0)
1154 #define updateHWFPControl(x) ((void)0)
1155 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1156 
1157 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1158  int realloc); // forward declaration
1159 
1160 /* Run a parallel region that has been serialized, so runs only in a team of the
1161  single master thread. */
1162 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1163  kmp_info_t *this_thr;
1164  kmp_team_t *serial_team;
1165 
1166  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1167 
1168  /* Skip all this code for autopar serialized loops since it results in
1169  unacceptable overhead */
1170  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1171  return;
1172 
1173  if (!TCR_4(__kmp_init_parallel))
1174  __kmp_parallel_initialize();
1175 
1176 #if OMP_50_ENABLED
1177  __kmp_resume_if_soft_paused();
1178 #endif
1179 
1180  this_thr = __kmp_threads[global_tid];
1181  serial_team = this_thr->th.th_serial_team;
1182 
1183  /* utilize the serialized team held by this thread */
1184  KMP_DEBUG_ASSERT(serial_team);
1185  KMP_MB();
1186 
1187  if (__kmp_tasking_mode != tskm_immediate_exec) {
1188  KMP_DEBUG_ASSERT(
1189  this_thr->th.th_task_team ==
1190  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1191  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1192  NULL);
1193  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1194  "team %p, new task_team = NULL\n",
1195  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1196  this_thr->th.th_task_team = NULL;
1197  }
1198 
1199 #if OMP_40_ENABLED
1200  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1201  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1202  proc_bind = proc_bind_false;
1203  } else if (proc_bind == proc_bind_default) {
1204  // No proc_bind clause was specified, so use the current value
1205  // of proc-bind-var for this parallel region.
1206  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1207  }
1208  // Reset for next parallel region
1209  this_thr->th.th_set_proc_bind = proc_bind_default;
1210 #endif /* OMP_40_ENABLED */
1211 
1212 #if OMPT_SUPPORT
1213  ompt_data_t ompt_parallel_data = ompt_data_none;
1214  ompt_data_t *implicit_task_data;
1215  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1216  if (ompt_enabled.enabled &&
1217  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1218 
1219  ompt_task_info_t *parent_task_info;
1220  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1221 
1222  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1223  if (ompt_enabled.ompt_callback_parallel_begin) {
1224  int team_size = 1;
1225 
1226  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1227  &(parent_task_info->task_data), &(parent_task_info->frame),
1228  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1229  codeptr);
1230  }
1231  }
1232 #endif // OMPT_SUPPORT
1233 
1234  if (this_thr->th.th_team != serial_team) {
1235  // Nested level will be an index in the nested nthreads array
1236  int level = this_thr->th.th_team->t.t_level;
1237 
1238  if (serial_team->t.t_serialized) {
1239  /* this serial team was already used
1240  TODO increase performance by making this locks more specific */
1241  kmp_team_t *new_team;
1242 
1243  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1244 
1245  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1246 #if OMPT_SUPPORT
1247  ompt_parallel_data,
1248 #endif
1249 #if OMP_40_ENABLED
1250  proc_bind,
1251 #endif
1252  &this_thr->th.th_current_task->td_icvs,
1253  0 USE_NESTED_HOT_ARG(NULL));
1254  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1255  KMP_ASSERT(new_team);
1256 
1257  /* setup new serialized team and install it */
1258  new_team->t.t_threads[0] = this_thr;
1259  new_team->t.t_parent = this_thr->th.th_team;
1260  serial_team = new_team;
1261  this_thr->th.th_serial_team = serial_team;
1262 
1263  KF_TRACE(
1264  10,
1265  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1266  global_tid, serial_team));
1267 
1268  /* TODO the above breaks the requirement that if we run out of resources,
1269  then we can still guarantee that serialized teams are ok, since we may
1270  need to allocate a new one */
1271  } else {
1272  KF_TRACE(
1273  10,
1274  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1275  global_tid, serial_team));
1276  }
1277 
1278  /* we have to initialize this serial team */
1279  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1280  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1281  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1282  serial_team->t.t_ident = loc;
1283  serial_team->t.t_serialized = 1;
1284  serial_team->t.t_nproc = 1;
1285  serial_team->t.t_parent = this_thr->th.th_team;
1286  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1287  this_thr->th.th_team = serial_team;
1288  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1289 
1290  KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1291  this_thr->th.th_current_task));
1292  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1293  this_thr->th.th_current_task->td_flags.executing = 0;
1294 
1295  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1296 
1297  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1298  implicit task for each serialized task represented by
1299  team->t.t_serialized? */
1300  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1301  &this_thr->th.th_current_task->td_parent->td_icvs);
1302 
1303  // Thread value exists in the nested nthreads array for the next nested
1304  // level
1305  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1306  this_thr->th.th_current_task->td_icvs.nproc =
1307  __kmp_nested_nth.nth[level + 1];
1308  }
1309 
1310 #if OMP_40_ENABLED
1311  if (__kmp_nested_proc_bind.used &&
1312  (level + 1 < __kmp_nested_proc_bind.used)) {
1313  this_thr->th.th_current_task->td_icvs.proc_bind =
1314  __kmp_nested_proc_bind.bind_types[level + 1];
1315  }
1316 #endif /* OMP_40_ENABLED */
1317 
1318 #if USE_DEBUGGER
1319  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1320 #endif
1321  this_thr->th.th_info.ds.ds_tid = 0;
1322 
1323  /* set thread cache values */
1324  this_thr->th.th_team_nproc = 1;
1325  this_thr->th.th_team_master = this_thr;
1326  this_thr->th.th_team_serialized = 1;
1327 
1328  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1329  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1330 #if OMP_50_ENABLED
1331  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1332 #endif
1333 
1334  propagateFPControl(serial_team);
1335 
1336  /* check if we need to allocate dispatch buffers stack */
1337  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1338  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1339  serial_team->t.t_dispatch->th_disp_buffer =
1340  (dispatch_private_info_t *)__kmp_allocate(
1341  sizeof(dispatch_private_info_t));
1342  }
1343  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1344 
1345  KMP_MB();
1346 
1347  } else {
1348  /* this serialized team is already being used,
1349  * that's fine, just add another nested level */
1350  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1351  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1352  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1353  ++serial_team->t.t_serialized;
1354  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1355 
1356  // Nested level will be an index in the nested nthreads array
1357  int level = this_thr->th.th_team->t.t_level;
1358  // Thread value exists in the nested nthreads array for the next nested
1359  // level
1360  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1361  this_thr->th.th_current_task->td_icvs.nproc =
1362  __kmp_nested_nth.nth[level + 1];
1363  }
1364  serial_team->t.t_level++;
1365  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1366  "of serial team %p to %d\n",
1367  global_tid, serial_team, serial_team->t.t_level));
1368 
1369  /* allocate/push dispatch buffers stack */
1370  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1371  {
1372  dispatch_private_info_t *disp_buffer =
1373  (dispatch_private_info_t *)__kmp_allocate(
1374  sizeof(dispatch_private_info_t));
1375  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1376  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1377  }
1378  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1379 
1380  KMP_MB();
1381  }
1382 #if OMP_40_ENABLED
1383  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1384 #endif
1385 
1386 #if OMP_50_ENABLED
1387  // Perform the display affinity functionality for
1388  // serialized parallel regions
1389  if (__kmp_display_affinity) {
1390  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1391  this_thr->th.th_prev_num_threads != 1) {
1392  // NULL means use the affinity-format-var ICV
1393  __kmp_aux_display_affinity(global_tid, NULL);
1394  this_thr->th.th_prev_level = serial_team->t.t_level;
1395  this_thr->th.th_prev_num_threads = 1;
1396  }
1397  }
1398 #endif
1399 
1400  if (__kmp_env_consistency_check)
1401  __kmp_push_parallel(global_tid, NULL);
1402 #if OMPT_SUPPORT
1403  serial_team->t.ompt_team_info.master_return_address = codeptr;
1404  if (ompt_enabled.enabled &&
1405  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1406  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1407 
1408  ompt_lw_taskteam_t lw_taskteam;
1409  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1410  &ompt_parallel_data, codeptr);
1411 
1412  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1413  // don't use lw_taskteam after linking. content was swaped
1414 
1415  /* OMPT implicit task begin */
1416  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1417  if (ompt_enabled.ompt_callback_implicit_task) {
1418  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1419  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1420  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1421  OMPT_CUR_TASK_INFO(this_thr)
1422  ->thread_num = __kmp_tid_from_gtid(global_tid);
1423  }
1424 
1425  /* OMPT state */
1426  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1427  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1428  }
1429 #endif
1430 }
1431 
1432 /* most of the work for a fork */
1433 /* return true if we really went parallel, false if serialized */
1434 int __kmp_fork_call(ident_t *loc, int gtid,
1435  enum fork_context_e call_context, // Intel, GNU, ...
1436  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1437 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1438 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1439  va_list *ap
1440 #else
1441  va_list ap
1442 #endif
1443  ) {
1444  void **argv;
1445  int i;
1446  int master_tid;
1447  int master_this_cons;
1448  kmp_team_t *team;
1449  kmp_team_t *parent_team;
1450  kmp_info_t *master_th;
1451  kmp_root_t *root;
1452  int nthreads;
1453  int master_active;
1454  int master_set_numthreads;
1455  int level;
1456 #if OMP_40_ENABLED
1457  int active_level;
1458  int teams_level;
1459 #endif
1460 #if KMP_NESTED_HOT_TEAMS
1461  kmp_hot_team_ptr_t **p_hot_teams;
1462 #endif
1463  { // KMP_TIME_BLOCK
1464  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1465  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1466 
1467  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1468  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1469  /* Some systems prefer the stack for the root thread(s) to start with */
1470  /* some gap from the parent stack to prevent false sharing. */
1471  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1472  /* These 2 lines below are so this does not get optimized out */
1473  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1474  __kmp_stkpadding += (short)((kmp_int64)dummy);
1475  }
1476 
1477  /* initialize if needed */
1478  KMP_DEBUG_ASSERT(
1479  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1480  if (!TCR_4(__kmp_init_parallel))
1481  __kmp_parallel_initialize();
1482 
1483 #if OMP_50_ENABLED
1484  __kmp_resume_if_soft_paused();
1485 #endif
1486 
1487  /* setup current data */
1488  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1489  // shutdown
1490  parent_team = master_th->th.th_team;
1491  master_tid = master_th->th.th_info.ds.ds_tid;
1492  master_this_cons = master_th->th.th_local.this_construct;
1493  root = master_th->th.th_root;
1494  master_active = root->r.r_active;
1495  master_set_numthreads = master_th->th.th_set_nproc;
1496 
1497 #if OMPT_SUPPORT
1498  ompt_data_t ompt_parallel_data = ompt_data_none;
1499  ompt_data_t *parent_task_data;
1500  ompt_frame_t *ompt_frame;
1501  ompt_data_t *implicit_task_data;
1502  void *return_address = NULL;
1503 
1504  if (ompt_enabled.enabled) {
1505  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1506  NULL, NULL);
1507  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1508  }
1509 #endif
1510 
1511  // Nested level will be an index in the nested nthreads array
1512  level = parent_team->t.t_level;
1513  // used to launch non-serial teams even if nested is not allowed
1514  active_level = parent_team->t.t_active_level;
1515 #if OMP_40_ENABLED
1516  // needed to check nesting inside the teams
1517  teams_level = master_th->th.th_teams_level;
1518 #endif
1519 #if KMP_NESTED_HOT_TEAMS
1520  p_hot_teams = &master_th->th.th_hot_teams;
1521  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1522  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1523  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1524  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1525  // it is either actual or not needed (when active_level > 0)
1526  (*p_hot_teams)[0].hot_team_nth = 1;
1527  }
1528 #endif
1529 
1530 #if OMPT_SUPPORT
1531  if (ompt_enabled.enabled) {
1532  if (ompt_enabled.ompt_callback_parallel_begin) {
1533  int team_size = master_set_numthreads
1534  ? master_set_numthreads
1535  : get__nproc_2(parent_team, master_tid);
1536  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1537  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1538  OMPT_INVOKER(call_context), return_address);
1539  }
1540  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1541  }
1542 #endif
1543 
1544  master_th->th.th_ident = loc;
1545 
1546 #if OMP_40_ENABLED
1547  if (master_th->th.th_teams_microtask && ap &&
1548  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1549  // AC: This is start of parallel that is nested inside teams construct.
1550  // The team is actual (hot), all workers are ready at the fork barrier.
1551  // No lock needed to initialize the team a bit, then free workers.
1552  parent_team->t.t_ident = loc;
1553  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1554  parent_team->t.t_argc = argc;
1555  argv = (void **)parent_team->t.t_argv;
1556  for (i = argc - 1; i >= 0; --i)
1557 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1558 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1559  *argv++ = va_arg(*ap, void *);
1560 #else
1561  *argv++ = va_arg(ap, void *);
1562 #endif
1563  // Increment our nested depth levels, but not increase the serialization
1564  if (parent_team == master_th->th.th_serial_team) {
1565  // AC: we are in serialized parallel
1566  __kmpc_serialized_parallel(loc, gtid);
1567  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1568  // AC: need this in order enquiry functions work
1569  // correctly, will restore at join time
1570  parent_team->t.t_serialized--;
1571 #if OMPT_SUPPORT
1572  void *dummy;
1573  void **exit_runtime_p;
1574 
1575  ompt_lw_taskteam_t lw_taskteam;
1576 
1577  if (ompt_enabled.enabled) {
1578  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1579  &ompt_parallel_data, return_address);
1580  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1581 
1582  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1583  // don't use lw_taskteam after linking. content was swaped
1584 
1585  /* OMPT implicit task begin */
1586  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1587  if (ompt_enabled.ompt_callback_implicit_task) {
1588  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1589  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1590  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1591  OMPT_CUR_TASK_INFO(master_th)
1592  ->thread_num = __kmp_tid_from_gtid(gtid);
1593  }
1594 
1595  /* OMPT state */
1596  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1597  } else {
1598  exit_runtime_p = &dummy;
1599  }
1600 #endif
1601 
1602  {
1603  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1604  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1605  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1606 #if OMPT_SUPPORT
1607  ,
1608  exit_runtime_p
1609 #endif
1610  );
1611  }
1612 
1613 #if OMPT_SUPPORT
1614  *exit_runtime_p = NULL;
1615  if (ompt_enabled.enabled) {
1616  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1617  if (ompt_enabled.ompt_callback_implicit_task) {
1618  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1619  ompt_scope_end, NULL, implicit_task_data, 1,
1620  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1621  }
1622  __ompt_lw_taskteam_unlink(master_th);
1623 
1624  if (ompt_enabled.ompt_callback_parallel_end) {
1625  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1626  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1627  OMPT_INVOKER(call_context), return_address);
1628  }
1629  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1630  }
1631 #endif
1632  return TRUE;
1633  }
1634 
1635  parent_team->t.t_pkfn = microtask;
1636  parent_team->t.t_invoke = invoker;
1637  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1638  parent_team->t.t_active_level++;
1639  parent_team->t.t_level++;
1640 #if OMP_50_ENABLED
1641  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1642 #endif
1643 
1644  /* Change number of threads in the team if requested */
1645  if (master_set_numthreads) { // The parallel has num_threads clause
1646  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1647  // AC: only can reduce number of threads dynamically, can't increase
1648  kmp_info_t **other_threads = parent_team->t.t_threads;
1649  parent_team->t.t_nproc = master_set_numthreads;
1650  for (i = 0; i < master_set_numthreads; ++i) {
1651  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1652  }
1653  // Keep extra threads hot in the team for possible next parallels
1654  }
1655  master_th->th.th_set_nproc = 0;
1656  }
1657 
1658 #if USE_DEBUGGER
1659  if (__kmp_debugging) { // Let debugger override number of threads.
1660  int nth = __kmp_omp_num_threads(loc);
1661  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1662  master_set_numthreads = nth;
1663  }
1664  }
1665 #endif
1666 
1667  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1668  "master_th=%p, gtid=%d\n",
1669  root, parent_team, master_th, gtid));
1670  __kmp_internal_fork(loc, gtid, parent_team);
1671  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1672  "master_th=%p, gtid=%d\n",
1673  root, parent_team, master_th, gtid));
1674 
1675  /* Invoke microtask for MASTER thread */
1676  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1677  parent_team->t.t_id, parent_team->t.t_pkfn));
1678 
1679  if (!parent_team->t.t_invoke(gtid)) {
1680  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1681  }
1682  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1683  parent_team->t.t_id, parent_team->t.t_pkfn));
1684  KMP_MB(); /* Flush all pending memory write invalidates. */
1685 
1686  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1687 
1688  return TRUE;
1689  } // Parallel closely nested in teams construct
1690 #endif /* OMP_40_ENABLED */
1691 
1692 #if KMP_DEBUG
1693  if (__kmp_tasking_mode != tskm_immediate_exec) {
1694  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1695  parent_team->t.t_task_team[master_th->th.th_task_state]);
1696  }
1697 #endif
1698 
1699  if (parent_team->t.t_active_level >=
1700  master_th->th.th_current_task->td_icvs.max_active_levels) {
1701  nthreads = 1;
1702  } else {
1703 #if OMP_40_ENABLED
1704  int enter_teams = ((ap == NULL && active_level == 0) ||
1705  (ap && teams_level > 0 && teams_level == level));
1706 #endif
1707  nthreads =
1708  master_set_numthreads
1709  ? master_set_numthreads
1710  : get__nproc_2(
1711  parent_team,
1712  master_tid); // TODO: get nproc directly from current task
1713 
1714  // Check if we need to take forkjoin lock? (no need for serialized
1715  // parallel out of teams construct). This code moved here from
1716  // __kmp_reserve_threads() to speedup nested serialized parallels.
1717  if (nthreads > 1) {
1718  if ((get__max_active_levels(master_th) == 1 && (root->r.r_in_parallel
1719 #if OMP_40_ENABLED
1720  && !enter_teams
1721 #endif /* OMP_40_ENABLED */
1722  )) ||
1723  (__kmp_library == library_serial)) {
1724  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1725  " threads\n",
1726  gtid, nthreads));
1727  nthreads = 1;
1728  }
1729  }
1730  if (nthreads > 1) {
1731  /* determine how many new threads we can use */
1732  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1733  nthreads = __kmp_reserve_threads(
1734  root, parent_team, master_tid, nthreads
1735 #if OMP_40_ENABLED
1736  /* AC: If we execute teams from parallel region (on host), then
1737  teams should be created but each can only have 1 thread if
1738  nesting is disabled. If teams called from serial region, then
1739  teams and their threads should be created regardless of the
1740  nesting setting. */
1741  ,
1742  enter_teams
1743 #endif /* OMP_40_ENABLED */
1744  );
1745  if (nthreads == 1) {
1746  // Free lock for single thread execution here; for multi-thread
1747  // execution it will be freed later after team of threads created
1748  // and initialized
1749  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1750  }
1751  }
1752  }
1753  KMP_DEBUG_ASSERT(nthreads > 0);
1754 
1755  // If we temporarily changed the set number of threads then restore it now
1756  master_th->th.th_set_nproc = 0;
1757 
1758  /* create a serialized parallel region? */
1759  if (nthreads == 1) {
1760 /* josh todo: hypothetical question: what do we do for OS X*? */
1761 #if KMP_OS_LINUX && \
1762  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1763  void *args[argc];
1764 #else
1765  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1766 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1767  KMP_ARCH_AARCH64) */
1768 
1769  KA_TRACE(20,
1770  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1771 
1772  __kmpc_serialized_parallel(loc, gtid);
1773 
1774  if (call_context == fork_context_intel) {
1775  /* TODO this sucks, use the compiler itself to pass args! :) */
1776  master_th->th.th_serial_team->t.t_ident = loc;
1777 #if OMP_40_ENABLED
1778  if (!ap) {
1779  // revert change made in __kmpc_serialized_parallel()
1780  master_th->th.th_serial_team->t.t_level--;
1781 // Get args from parent team for teams construct
1782 
1783 #if OMPT_SUPPORT
1784  void *dummy;
1785  void **exit_runtime_p;
1786  ompt_task_info_t *task_info;
1787 
1788  ompt_lw_taskteam_t lw_taskteam;
1789 
1790  if (ompt_enabled.enabled) {
1791  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1792  &ompt_parallel_data, return_address);
1793 
1794  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1795  // don't use lw_taskteam after linking. content was swaped
1796 
1797  task_info = OMPT_CUR_TASK_INFO(master_th);
1798  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1799  if (ompt_enabled.ompt_callback_implicit_task) {
1800  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1801  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1802  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1803  OMPT_CUR_TASK_INFO(master_th)
1804  ->thread_num = __kmp_tid_from_gtid(gtid);
1805  }
1806 
1807  /* OMPT state */
1808  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1809  } else {
1810  exit_runtime_p = &dummy;
1811  }
1812 #endif
1813 
1814  {
1815  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1816  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1817  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1818  parent_team->t.t_argv
1819 #if OMPT_SUPPORT
1820  ,
1821  exit_runtime_p
1822 #endif
1823  );
1824  }
1825 
1826 #if OMPT_SUPPORT
1827  if (ompt_enabled.enabled) {
1828  exit_runtime_p = NULL;
1829  if (ompt_enabled.ompt_callback_implicit_task) {
1830  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1831  ompt_scope_end, NULL, &(task_info->task_data), 1,
1832  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1833  }
1834 
1835  __ompt_lw_taskteam_unlink(master_th);
1836  if (ompt_enabled.ompt_callback_parallel_end) {
1837  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1838  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1839  OMPT_INVOKER(call_context), return_address);
1840  }
1841  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1842  }
1843 #endif
1844  } else if (microtask == (microtask_t)__kmp_teams_master) {
1845  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1846  master_th->th.th_serial_team);
1847  team = master_th->th.th_team;
1848  // team->t.t_pkfn = microtask;
1849  team->t.t_invoke = invoker;
1850  __kmp_alloc_argv_entries(argc, team, TRUE);
1851  team->t.t_argc = argc;
1852  argv = (void **)team->t.t_argv;
1853  if (ap) {
1854  for (i = argc - 1; i >= 0; --i)
1855 // TODO: revert workaround for Intel(R) 64 tracker #96
1856 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1857  *argv++ = va_arg(*ap, void *);
1858 #else
1859  *argv++ = va_arg(ap, void *);
1860 #endif
1861  } else {
1862  for (i = 0; i < argc; ++i)
1863  // Get args from parent team for teams construct
1864  argv[i] = parent_team->t.t_argv[i];
1865  }
1866  // AC: revert change made in __kmpc_serialized_parallel()
1867  // because initial code in teams should have level=0
1868  team->t.t_level--;
1869  // AC: call special invoker for outer "parallel" of teams construct
1870  invoker(gtid);
1871  } else {
1872 #endif /* OMP_40_ENABLED */
1873  argv = args;
1874  for (i = argc - 1; i >= 0; --i)
1875 // TODO: revert workaround for Intel(R) 64 tracker #96
1876 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1877  *argv++ = va_arg(*ap, void *);
1878 #else
1879  *argv++ = va_arg(ap, void *);
1880 #endif
1881  KMP_MB();
1882 
1883 #if OMPT_SUPPORT
1884  void *dummy;
1885  void **exit_runtime_p;
1886  ompt_task_info_t *task_info;
1887 
1888  ompt_lw_taskteam_t lw_taskteam;
1889 
1890  if (ompt_enabled.enabled) {
1891  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1892  &ompt_parallel_data, return_address);
1893  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1894  // don't use lw_taskteam after linking. content was swaped
1895  task_info = OMPT_CUR_TASK_INFO(master_th);
1896  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1897 
1898  /* OMPT implicit task begin */
1899  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1900  if (ompt_enabled.ompt_callback_implicit_task) {
1901  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1902  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1903  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1904  OMPT_CUR_TASK_INFO(master_th)
1905  ->thread_num = __kmp_tid_from_gtid(gtid);
1906  }
1907 
1908  /* OMPT state */
1909  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910  } else {
1911  exit_runtime_p = &dummy;
1912  }
1913 #endif
1914 
1915  {
1916  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920  ,
1921  exit_runtime_p
1922 #endif
1923  );
1924  }
1925 
1926 #if OMPT_SUPPORT
1927  if (ompt_enabled.enabled) {
1928  *exit_runtime_p = NULL;
1929  if (ompt_enabled.ompt_callback_implicit_task) {
1930  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931  ompt_scope_end, NULL, &(task_info->task_data), 1,
1932  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1933  }
1934 
1935  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1936  __ompt_lw_taskteam_unlink(master_th);
1937  if (ompt_enabled.ompt_callback_parallel_end) {
1938  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1939  &ompt_parallel_data, parent_task_data,
1940  OMPT_INVOKER(call_context), return_address);
1941  }
1942  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1943  }
1944 #endif
1945 #if OMP_40_ENABLED
1946  }
1947 #endif /* OMP_40_ENABLED */
1948  } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950  ompt_lw_taskteam_t lwt;
1951  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952  return_address);
1953 
1954  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking. content was swaped
1957 #endif
1958 
1959  // we were called from GNU native code
1960  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961  return FALSE;
1962  } else {
1963  KMP_ASSERT2(call_context < fork_context_last,
1964  "__kmp_fork_call: unknown fork_context parameter");
1965  }
1966 
1967  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968  KMP_MB();
1969  return FALSE;
1970  } // if (nthreads == 1)
1971 
1972  // GEH: only modify the executing flag in the case when not serialized
1973  // serialized case is handled in kmpc_serialized_parallel
1974  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975  "curtask=%p, curtask_max_aclevel=%d\n",
1976  parent_team->t.t_active_level, master_th,
1977  master_th->th.th_current_task,
1978  master_th->th.th_current_task->td_icvs.max_active_levels));
1979  // TODO: GEH - cannot do this assertion because root thread not set up as
1980  // executing
1981  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982  master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984 #if OMP_40_ENABLED
1985  if (!master_th->th.th_teams_microtask || level > teams_level)
1986 #endif /* OMP_40_ENABLED */
1987  {
1988  /* Increment our nested depth level */
1989  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1990  }
1991 
1992  // See if we need to make a copy of the ICVs.
1993  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1994  if ((level + 1 < __kmp_nested_nth.used) &&
1995  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1996  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1997  } else {
1998  nthreads_icv = 0; // don't update
1999  }
2000 
2001 #if OMP_40_ENABLED
2002  // Figure out the proc_bind_policy for the new team.
2003  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2004  kmp_proc_bind_t proc_bind_icv =
2005  proc_bind_default; // proc_bind_default means don't update
2006  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2007  proc_bind = proc_bind_false;
2008  } else {
2009  if (proc_bind == proc_bind_default) {
2010  // No proc_bind clause specified; use current proc-bind-var for this
2011  // parallel region
2012  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2013  }
2014  /* else: The proc_bind policy was specified explicitly on parallel clause.
2015  This overrides proc-bind-var for this parallel region, but does not
2016  change proc-bind-var. */
2017  // Figure the value of proc-bind-var for the child threads.
2018  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2019  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2020  master_th->th.th_current_task->td_icvs.proc_bind)) {
2021  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2022  }
2023  }
2024 
2025  // Reset for next parallel region
2026  master_th->th.th_set_proc_bind = proc_bind_default;
2027 #endif /* OMP_40_ENABLED */
2028 
2029  if ((nthreads_icv > 0)
2030 #if OMP_40_ENABLED
2031  || (proc_bind_icv != proc_bind_default)
2032 #endif /* OMP_40_ENABLED */
2033  ) {
2034  kmp_internal_control_t new_icvs;
2035  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036  new_icvs.next = NULL;
2037  if (nthreads_icv > 0) {
2038  new_icvs.nproc = nthreads_icv;
2039  }
2040 
2041 #if OMP_40_ENABLED
2042  if (proc_bind_icv != proc_bind_default) {
2043  new_icvs.proc_bind = proc_bind_icv;
2044  }
2045 #endif /* OMP_40_ENABLED */
2046 
2047  /* allocate a new parallel team */
2048  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2049  team = __kmp_allocate_team(root, nthreads, nthreads,
2050 #if OMPT_SUPPORT
2051  ompt_parallel_data,
2052 #endif
2053 #if OMP_40_ENABLED
2054  proc_bind,
2055 #endif
2056  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2057  } else {
2058  /* allocate a new parallel team */
2059  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2060  team = __kmp_allocate_team(root, nthreads, nthreads,
2061 #if OMPT_SUPPORT
2062  ompt_parallel_data,
2063 #endif
2064 #if OMP_40_ENABLED
2065  proc_bind,
2066 #endif
2067  &master_th->th.th_current_task->td_icvs,
2068  argc USE_NESTED_HOT_ARG(master_th));
2069  }
2070  KF_TRACE(
2071  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2072 
2073  /* setup the new team */
2074  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2075  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2076  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2077  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2078  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2079 #if OMPT_SUPPORT
2080  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2081  return_address);
2082 #endif
2083  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2084 // TODO: parent_team->t.t_level == INT_MAX ???
2085 #if OMP_40_ENABLED
2086  if (!master_th->th.th_teams_microtask || level > teams_level) {
2087 #endif /* OMP_40_ENABLED */
2088  int new_level = parent_team->t.t_level + 1;
2089  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2090  new_level = parent_team->t.t_active_level + 1;
2091  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2092 #if OMP_40_ENABLED
2093  } else {
2094  // AC: Do not increase parallel level at start of the teams construct
2095  int new_level = parent_team->t.t_level;
2096  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2097  new_level = parent_team->t.t_active_level;
2098  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2099  }
2100 #endif /* OMP_40_ENABLED */
2101  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2102  // set master's schedule as new run-time schedule
2103  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2104 
2105 #if OMP_40_ENABLED
2106  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2107 #endif
2108 #if OMP_50_ENABLED
2109  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2110 #endif
2111 
2112  // Update the floating point rounding in the team if required.
2113  propagateFPControl(team);
2114 
2115  if (__kmp_tasking_mode != tskm_immediate_exec) {
2116  // Set master's task team to team's task team. Unless this is hot team, it
2117  // should be NULL.
2118  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2119  parent_team->t.t_task_team[master_th->th.th_task_state]);
2120  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2121  "%p, new task_team %p / team %p\n",
2122  __kmp_gtid_from_thread(master_th),
2123  master_th->th.th_task_team, parent_team,
2124  team->t.t_task_team[master_th->th.th_task_state], team));
2125 
2126  if (active_level || master_th->th.th_task_team) {
2127  // Take a memo of master's task_state
2128  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2129  if (master_th->th.th_task_state_top >=
2130  master_th->th.th_task_state_stack_sz) { // increase size
2131  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2132  kmp_uint8 *old_stack, *new_stack;
2133  kmp_uint32 i;
2134  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2135  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2136  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2137  }
2138  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2139  ++i) { // zero-init rest of stack
2140  new_stack[i] = 0;
2141  }
2142  old_stack = master_th->th.th_task_state_memo_stack;
2143  master_th->th.th_task_state_memo_stack = new_stack;
2144  master_th->th.th_task_state_stack_sz = new_size;
2145  __kmp_free(old_stack);
2146  }
2147  // Store master's task_state on stack
2148  master_th->th
2149  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2150  master_th->th.th_task_state;
2151  master_th->th.th_task_state_top++;
2152 #if KMP_NESTED_HOT_TEAMS
2153  if (master_th->th.th_hot_teams &&
2154  active_level < __kmp_hot_teams_max_level &&
2155  team == master_th->th.th_hot_teams[active_level].hot_team) {
2156  // Restore master's nested state if nested hot team
2157  master_th->th.th_task_state =
2158  master_th->th
2159  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2160  } else {
2161 #endif
2162  master_th->th.th_task_state = 0;
2163 #if KMP_NESTED_HOT_TEAMS
2164  }
2165 #endif
2166  }
2167 #if !KMP_NESTED_HOT_TEAMS
2168  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2169  (team == root->r.r_hot_team));
2170 #endif
2171  }
2172 
2173  KA_TRACE(
2174  20,
2175  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2176  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2177  team->t.t_nproc));
2178  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2179  (team->t.t_master_tid == 0 &&
2180  (team->t.t_parent == root->r.r_root_team ||
2181  team->t.t_parent->t.t_serialized)));
2182  KMP_MB();
2183 
2184  /* now, setup the arguments */
2185  argv = (void **)team->t.t_argv;
2186 #if OMP_40_ENABLED
2187  if (ap) {
2188 #endif /* OMP_40_ENABLED */
2189  for (i = argc - 1; i >= 0; --i) {
2190 // TODO: revert workaround for Intel(R) 64 tracker #96
2191 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2192  void *new_argv = va_arg(*ap, void *);
2193 #else
2194  void *new_argv = va_arg(ap, void *);
2195 #endif
2196  KMP_CHECK_UPDATE(*argv, new_argv);
2197  argv++;
2198  }
2199 #if OMP_40_ENABLED
2200  } else {
2201  for (i = 0; i < argc; ++i) {
2202  // Get args from parent team for teams construct
2203  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2204  }
2205  }
2206 #endif /* OMP_40_ENABLED */
2207 
2208  /* now actually fork the threads */
2209  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2210  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2211  root->r.r_active = TRUE;
2212 
2213  __kmp_fork_team_threads(root, team, master_th, gtid);
2214  __kmp_setup_icv_copy(team, nthreads,
2215  &master_th->th.th_current_task->td_icvs, loc);
2216 
2217 #if OMPT_SUPPORT
2218  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2219 #endif
2220 
2221  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2222 
2223 #if USE_ITT_BUILD
2224  if (team->t.t_active_level == 1 // only report frames at level 1
2225 #if OMP_40_ENABLED
2226  && !master_th->th.th_teams_microtask // not in teams construct
2227 #endif /* OMP_40_ENABLED */
2228  ) {
2229 #if USE_ITT_NOTIFY
2230  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2231  (__kmp_forkjoin_frames_mode == 3 ||
2232  __kmp_forkjoin_frames_mode == 1)) {
2233  kmp_uint64 tmp_time = 0;
2234  if (__itt_get_timestamp_ptr)
2235  tmp_time = __itt_get_timestamp();
2236  // Internal fork - report frame begin
2237  master_th->th.th_frame_time = tmp_time;
2238  if (__kmp_forkjoin_frames_mode == 3)
2239  team->t.t_region_time = tmp_time;
2240  } else
2241 // only one notification scheme (either "submit" or "forking/joined", not both)
2242 #endif /* USE_ITT_NOTIFY */
2243  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2244  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2245  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2246  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2247  }
2248  }
2249 #endif /* USE_ITT_BUILD */
2250 
2251  /* now go on and do the work */
2252  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2253  KMP_MB();
2254  KF_TRACE(10,
2255  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2256  root, team, master_th, gtid));
2257 
2258 #if USE_ITT_BUILD
2259  if (__itt_stack_caller_create_ptr) {
2260  team->t.t_stack_id =
2261  __kmp_itt_stack_caller_create(); // create new stack stitching id
2262  // before entering fork barrier
2263  }
2264 #endif /* USE_ITT_BUILD */
2265 
2266 #if OMP_40_ENABLED
2267  // AC: skip __kmp_internal_fork at teams construct, let only master
2268  // threads execute
2269  if (ap)
2270 #endif /* OMP_40_ENABLED */
2271  {
2272  __kmp_internal_fork(loc, gtid, team);
2273  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2274  "master_th=%p, gtid=%d\n",
2275  root, team, master_th, gtid));
2276  }
2277 
2278  if (call_context == fork_context_gnu) {
2279  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2280  return TRUE;
2281  }
2282 
2283  /* Invoke microtask for MASTER thread */
2284  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2285  team->t.t_id, team->t.t_pkfn));
2286  } // END of timer KMP_fork_call block
2287 
2288 #if KMP_STATS_ENABLED && OMP_40_ENABLED
2289  // If beginning a teams construct, then change thread state
2290  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2291  if (!ap) {
2292  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2293  }
2294 #endif
2295 
2296  if (!team->t.t_invoke(gtid)) {
2297  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2298  }
2299 
2300 #if KMP_STATS_ENABLED && OMP_40_ENABLED
2301  // If was beginning of a teams construct, then reset thread state
2302  if (!ap) {
2303  KMP_SET_THREAD_STATE(previous_state);
2304  }
2305 #endif
2306 
2307  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2308  team->t.t_id, team->t.t_pkfn));
2309  KMP_MB(); /* Flush all pending memory write invalidates. */
2310 
2311  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2312 
2313 #if OMPT_SUPPORT
2314  if (ompt_enabled.enabled) {
2315  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2316  }
2317 #endif
2318 
2319  return TRUE;
2320 }
2321 
2322 #if OMPT_SUPPORT
2323 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2324  kmp_team_t *team) {
2325  // restore state outside the region
2326  thread->th.ompt_thread_info.state =
2327  ((team->t.t_serialized) ? ompt_state_work_serial
2328  : ompt_state_work_parallel);
2329 }
2330 
2331 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2332  kmp_team_t *team, ompt_data_t *parallel_data,
2333  fork_context_e fork_context, void *codeptr) {
2334  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2335  if (ompt_enabled.ompt_callback_parallel_end) {
2336  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2337  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2338  codeptr);
2339  }
2340 
2341  task_info->frame.enter_frame = ompt_data_none;
2342  __kmp_join_restore_state(thread, team);
2343 }
2344 #endif
2345 
2346 void __kmp_join_call(ident_t *loc, int gtid
2347 #if OMPT_SUPPORT
2348  ,
2349  enum fork_context_e fork_context
2350 #endif
2351 #if OMP_40_ENABLED
2352  ,
2353  int exit_teams
2354 #endif /* OMP_40_ENABLED */
2355  ) {
2356  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2357  kmp_team_t *team;
2358  kmp_team_t *parent_team;
2359  kmp_info_t *master_th;
2360  kmp_root_t *root;
2361  int master_active;
2362 
2363  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2364 
2365  /* setup current data */
2366  master_th = __kmp_threads[gtid];
2367  root = master_th->th.th_root;
2368  team = master_th->th.th_team;
2369  parent_team = team->t.t_parent;
2370 
2371  master_th->th.th_ident = loc;
2372 
2373 #if OMPT_SUPPORT
2374  if (ompt_enabled.enabled) {
2375  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2376  }
2377 #endif
2378 
2379 #if KMP_DEBUG
2380  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2381  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2382  "th_task_team = %p\n",
2383  __kmp_gtid_from_thread(master_th), team,
2384  team->t.t_task_team[master_th->th.th_task_state],
2385  master_th->th.th_task_team));
2386  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2387  team->t.t_task_team[master_th->th.th_task_state]);
2388  }
2389 #endif
2390 
2391  if (team->t.t_serialized) {
2392 #if OMP_40_ENABLED
2393  if (master_th->th.th_teams_microtask) {
2394  // We are in teams construct
2395  int level = team->t.t_level;
2396  int tlevel = master_th->th.th_teams_level;
2397  if (level == tlevel) {
2398  // AC: we haven't incremented it earlier at start of teams construct,
2399  // so do it here - at the end of teams construct
2400  team->t.t_level++;
2401  } else if (level == tlevel + 1) {
2402  // AC: we are exiting parallel inside teams, need to increment
2403  // serialization in order to restore it in the next call to
2404  // __kmpc_end_serialized_parallel
2405  team->t.t_serialized++;
2406  }
2407  }
2408 #endif /* OMP_40_ENABLED */
2409  __kmpc_end_serialized_parallel(loc, gtid);
2410 
2411 #if OMPT_SUPPORT
2412  if (ompt_enabled.enabled) {
2413  __kmp_join_restore_state(master_th, parent_team);
2414  }
2415 #endif
2416 
2417  return;
2418  }
2419 
2420  master_active = team->t.t_master_active;
2421 
2422 #if OMP_40_ENABLED
2423  if (!exit_teams)
2424 #endif /* OMP_40_ENABLED */
2425  {
2426  // AC: No barrier for internal teams at exit from teams construct.
2427  // But there is barrier for external team (league).
2428  __kmp_internal_join(loc, gtid, team);
2429  }
2430 #if OMP_40_ENABLED
2431  else {
2432  master_th->th.th_task_state =
2433  0; // AC: no tasking in teams (out of any parallel)
2434  }
2435 #endif /* OMP_40_ENABLED */
2436 
2437  KMP_MB();
2438 
2439 #if OMPT_SUPPORT
2440  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2441  void *codeptr = team->t.ompt_team_info.master_return_address;
2442 #endif
2443 
2444 #if USE_ITT_BUILD
2445  if (__itt_stack_caller_create_ptr) {
2446  __kmp_itt_stack_caller_destroy(
2447  (__itt_caller)team->t
2448  .t_stack_id); // destroy the stack stitching id after join barrier
2449  }
2450 
2451  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2452  if (team->t.t_active_level == 1
2453 #if OMP_40_ENABLED
2454  && !master_th->th.th_teams_microtask /* not in teams construct */
2455 #endif /* OMP_40_ENABLED */
2456  ) {
2457  master_th->th.th_ident = loc;
2458  // only one notification scheme (either "submit" or "forking/joined", not
2459  // both)
2460  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2461  __kmp_forkjoin_frames_mode == 3)
2462  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2463  master_th->th.th_frame_time, 0, loc,
2464  master_th->th.th_team_nproc, 1);
2465  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2466  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2467  __kmp_itt_region_joined(gtid);
2468  } // active_level == 1
2469 #endif /* USE_ITT_BUILD */
2470 
2471 #if OMP_40_ENABLED
2472  if (master_th->th.th_teams_microtask && !exit_teams &&
2473  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2474  team->t.t_level == master_th->th.th_teams_level + 1) {
2475  // AC: We need to leave the team structure intact at the end of parallel
2476  // inside the teams construct, so that at the next parallel same (hot) team
2477  // works, only adjust nesting levels
2478 
2479  /* Decrement our nested depth level */
2480  team->t.t_level--;
2481  team->t.t_active_level--;
2482  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2483 
2484  // Restore number of threads in the team if needed. This code relies on
2485  // the proper adjustment of th_teams_size.nth after the fork in
2486  // __kmp_teams_master on each teams master in the case that
2487  // __kmp_reserve_threads reduced it.
2488  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2489  int old_num = master_th->th.th_team_nproc;
2490  int new_num = master_th->th.th_teams_size.nth;
2491  kmp_info_t **other_threads = team->t.t_threads;
2492  team->t.t_nproc = new_num;
2493  for (int i = 0; i < old_num; ++i) {
2494  other_threads[i]->th.th_team_nproc = new_num;
2495  }
2496  // Adjust states of non-used threads of the team
2497  for (int i = old_num; i < new_num; ++i) {
2498  // Re-initialize thread's barrier data.
2499  KMP_DEBUG_ASSERT(other_threads[i]);
2500  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2501  for (int b = 0; b < bs_last_barrier; ++b) {
2502  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2503  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2504 #if USE_DEBUGGER
2505  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2506 #endif
2507  }
2508  if (__kmp_tasking_mode != tskm_immediate_exec) {
2509  // Synchronize thread's task state
2510  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2511  }
2512  }
2513  }
2514 
2515 #if OMPT_SUPPORT
2516  if (ompt_enabled.enabled) {
2517  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2518  codeptr);
2519  }
2520 #endif
2521 
2522  return;
2523  }
2524 #endif /* OMP_40_ENABLED */
2525 
2526  /* do cleanup and restore the parent team */
2527  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2528  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2529 
2530  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2531 
2532  /* jc: The following lock has instructions with REL and ACQ semantics,
2533  separating the parallel user code called in this parallel region
2534  from the serial user code called after this function returns. */
2535  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2536 
2537 #if OMP_40_ENABLED
2538  if (!master_th->th.th_teams_microtask ||
2539  team->t.t_level > master_th->th.th_teams_level)
2540 #endif /* OMP_40_ENABLED */
2541  {
2542  /* Decrement our nested depth level */
2543  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2544  }
2545  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2546 
2547 #if OMPT_SUPPORT
2548  if (ompt_enabled.enabled) {
2549  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2550  if (ompt_enabled.ompt_callback_implicit_task) {
2551  int ompt_team_size = team->t.t_nproc;
2552  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2553  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2554  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2555  }
2556 
2557  task_info->frame.exit_frame = ompt_data_none;
2558  task_info->task_data = ompt_data_none;
2559  }
2560 #endif
2561 
2562  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2563  master_th, team));
2564  __kmp_pop_current_task_from_thread(master_th);
2565 
2566 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2567  // Restore master thread's partition.
2568  master_th->th.th_first_place = team->t.t_first_place;
2569  master_th->th.th_last_place = team->t.t_last_place;
2570 #endif /* OMP_40_ENABLED */
2571 #if OMP_50_ENABLED
2572  master_th->th.th_def_allocator = team->t.t_def_allocator;
2573 #endif
2574 
2575  updateHWFPControl(team);
2576 
2577  if (root->r.r_active != master_active)
2578  root->r.r_active = master_active;
2579 
2580  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2581  master_th)); // this will free worker threads
2582 
2583  /* this race was fun to find. make sure the following is in the critical
2584  region otherwise assertions may fail occasionally since the old team may be
2585  reallocated and the hierarchy appears inconsistent. it is actually safe to
2586  run and won't cause any bugs, but will cause those assertion failures. it's
2587  only one deref&assign so might as well put this in the critical region */
2588  master_th->th.th_team = parent_team;
2589  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2590  master_th->th.th_team_master = parent_team->t.t_threads[0];
2591  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2592 
2593  /* restore serialized team, if need be */
2594  if (parent_team->t.t_serialized &&
2595  parent_team != master_th->th.th_serial_team &&
2596  parent_team != root->r.r_root_team) {
2597  __kmp_free_team(root,
2598  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2599  master_th->th.th_serial_team = parent_team;
2600  }
2601 
2602  if (__kmp_tasking_mode != tskm_immediate_exec) {
2603  if (master_th->th.th_task_state_top >
2604  0) { // Restore task state from memo stack
2605  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2606  // Remember master's state if we re-use this nested hot team
2607  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2608  master_th->th.th_task_state;
2609  --master_th->th.th_task_state_top; // pop
2610  // Now restore state at this level
2611  master_th->th.th_task_state =
2612  master_th->th
2613  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2614  }
2615  // Copy the task team from the parent team to the master thread
2616  master_th->th.th_task_team =
2617  parent_team->t.t_task_team[master_th->th.th_task_state];
2618  KA_TRACE(20,
2619  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2620  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2621  parent_team));
2622  }
2623 
2624  // TODO: GEH - cannot do this assertion because root thread not set up as
2625  // executing
2626  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2627  master_th->th.th_current_task->td_flags.executing = 1;
2628 
2629  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2630 
2631 #if OMPT_SUPPORT
2632  if (ompt_enabled.enabled) {
2633  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2634  codeptr);
2635  }
2636 #endif
2637 
2638  KMP_MB();
2639  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2640 }
2641 
2642 /* Check whether we should push an internal control record onto the
2643  serial team stack. If so, do it. */
2644 void __kmp_save_internal_controls(kmp_info_t *thread) {
2645 
2646  if (thread->th.th_team != thread->th.th_serial_team) {
2647  return;
2648  }
2649  if (thread->th.th_team->t.t_serialized > 1) {
2650  int push = 0;
2651 
2652  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2653  push = 1;
2654  } else {
2655  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2656  thread->th.th_team->t.t_serialized) {
2657  push = 1;
2658  }
2659  }
2660  if (push) { /* push a record on the serial team's stack */
2661  kmp_internal_control_t *control =
2662  (kmp_internal_control_t *)__kmp_allocate(
2663  sizeof(kmp_internal_control_t));
2664 
2665  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2666 
2667  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2668 
2669  control->next = thread->th.th_team->t.t_control_stack_top;
2670  thread->th.th_team->t.t_control_stack_top = control;
2671  }
2672  }
2673 }
2674 
2675 /* Changes set_nproc */
2676 void __kmp_set_num_threads(int new_nth, int gtid) {
2677  kmp_info_t *thread;
2678  kmp_root_t *root;
2679 
2680  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2681  KMP_DEBUG_ASSERT(__kmp_init_serial);
2682 
2683  if (new_nth < 1)
2684  new_nth = 1;
2685  else if (new_nth > __kmp_max_nth)
2686  new_nth = __kmp_max_nth;
2687 
2688  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2689  thread = __kmp_threads[gtid];
2690  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2691  return; // nothing to do
2692 
2693  __kmp_save_internal_controls(thread);
2694 
2695  set__nproc(thread, new_nth);
2696 
2697  // If this omp_set_num_threads() call will cause the hot team size to be
2698  // reduced (in the absence of a num_threads clause), then reduce it now,
2699  // rather than waiting for the next parallel region.
2700  root = thread->th.th_root;
2701  if (__kmp_init_parallel && (!root->r.r_active) &&
2702  (root->r.r_hot_team->t.t_nproc > new_nth)
2703 #if KMP_NESTED_HOT_TEAMS
2704  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2705 #endif
2706  ) {
2707  kmp_team_t *hot_team = root->r.r_hot_team;
2708  int f;
2709 
2710  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2711 
2712  // Release the extra threads we don't need any more.
2713  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2714  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2715  if (__kmp_tasking_mode != tskm_immediate_exec) {
2716  // When decreasing team size, threads no longer in the team should unref
2717  // task team.
2718  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2719  }
2720  __kmp_free_thread(hot_team->t.t_threads[f]);
2721  hot_team->t.t_threads[f] = NULL;
2722  }
2723  hot_team->t.t_nproc = new_nth;
2724 #if KMP_NESTED_HOT_TEAMS
2725  if (thread->th.th_hot_teams) {
2726  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2727  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2728  }
2729 #endif
2730 
2731  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732 
2733  // Update the t_nproc field in the threads that are still active.
2734  for (f = 0; f < new_nth; f++) {
2735  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2736  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2737  }
2738  // Special flag in case omp_set_num_threads() call
2739  hot_team->t.t_size_changed = -1;
2740  }
2741 }
2742 
2743 /* Changes max_active_levels */
2744 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2745  kmp_info_t *thread;
2746 
2747  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2748  "%d = (%d)\n",
2749  gtid, max_active_levels));
2750  KMP_DEBUG_ASSERT(__kmp_init_serial);
2751 
2752  // validate max_active_levels
2753  if (max_active_levels < 0) {
2754  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2755  // We ignore this call if the user has specified a negative value.
2756  // The current setting won't be changed. The last valid setting will be
2757  // used. A warning will be issued (if warnings are allowed as controlled by
2758  // the KMP_WARNINGS env var).
2759  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2760  "max_active_levels for thread %d = (%d)\n",
2761  gtid, max_active_levels));
2762  return;
2763  }
2764  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2765  // it's OK, the max_active_levels is within the valid range: [ 0;
2766  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2767  // We allow a zero value. (implementation defined behavior)
2768  } else {
2769  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2770  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2771  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2772  // Current upper limit is MAX_INT. (implementation defined behavior)
2773  // If the input exceeds the upper limit, we correct the input to be the
2774  // upper limit. (implementation defined behavior)
2775  // Actually, the flow should never get here until we use MAX_INT limit.
2776  }
2777  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2778  "max_active_levels for thread %d = (%d)\n",
2779  gtid, max_active_levels));
2780 
2781  thread = __kmp_threads[gtid];
2782 
2783  __kmp_save_internal_controls(thread);
2784 
2785  set__max_active_levels(thread, max_active_levels);
2786 }
2787 
2788 /* Gets max_active_levels */
2789 int __kmp_get_max_active_levels(int gtid) {
2790  kmp_info_t *thread;
2791 
2792  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2793  KMP_DEBUG_ASSERT(__kmp_init_serial);
2794 
2795  thread = __kmp_threads[gtid];
2796  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2797  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2798  "curtask_maxaclevel=%d\n",
2799  gtid, thread->th.th_current_task,
2800  thread->th.th_current_task->td_icvs.max_active_levels));
2801  return thread->th.th_current_task->td_icvs.max_active_levels;
2802 }
2803 
2804 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2805 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2806 
2807 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2808 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2809  kmp_info_t *thread;
2810  kmp_sched_t orig_kind;
2811  // kmp_team_t *team;
2812 
2813  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2814  gtid, (int)kind, chunk));
2815  KMP_DEBUG_ASSERT(__kmp_init_serial);
2816 
2817  // Check if the kind parameter is valid, correct if needed.
2818  // Valid parameters should fit in one of two intervals - standard or extended:
2819  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2820  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2821  orig_kind = kind;
2822  kind = __kmp_sched_without_mods(kind);
2823 
2824  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2825  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2826  // TODO: Hint needs attention in case we change the default schedule.
2827  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2828  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2829  __kmp_msg_null);
2830  kind = kmp_sched_default;
2831  chunk = 0; // ignore chunk value in case of bad kind
2832  }
2833 
2834  thread = __kmp_threads[gtid];
2835 
2836  __kmp_save_internal_controls(thread);
2837 
2838  if (kind < kmp_sched_upper_std) {
2839  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2840  // differ static chunked vs. unchunked: chunk should be invalid to
2841  // indicate unchunked schedule (which is the default)
2842  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2843  } else {
2844  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2845  __kmp_sch_map[kind - kmp_sched_lower - 1];
2846  }
2847  } else {
2848  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2849  // kmp_sched_lower - 2 ];
2850  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2851  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2852  kmp_sched_lower - 2];
2853  }
2854  __kmp_sched_apply_mods_intkind(
2855  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2856  if (kind == kmp_sched_auto || chunk < 1) {
2857  // ignore parameter chunk for schedule auto
2858  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2859  } else {
2860  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2861  }
2862 }
2863 
2864 /* Gets def_sched_var ICV values */
2865 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2866  kmp_info_t *thread;
2867  enum sched_type th_type;
2868 
2869  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2870  KMP_DEBUG_ASSERT(__kmp_init_serial);
2871 
2872  thread = __kmp_threads[gtid];
2873 
2874  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2875  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2876  case kmp_sch_static:
2877  case kmp_sch_static_greedy:
2878  case kmp_sch_static_balanced:
2879  *kind = kmp_sched_static;
2880  __kmp_sched_apply_mods_stdkind(kind, th_type);
2881  *chunk = 0; // chunk was not set, try to show this fact via zero value
2882  return;
2883  case kmp_sch_static_chunked:
2884  *kind = kmp_sched_static;
2885  break;
2886  case kmp_sch_dynamic_chunked:
2887  *kind = kmp_sched_dynamic;
2888  break;
2890  case kmp_sch_guided_iterative_chunked:
2891  case kmp_sch_guided_analytical_chunked:
2892  *kind = kmp_sched_guided;
2893  break;
2894  case kmp_sch_auto:
2895  *kind = kmp_sched_auto;
2896  break;
2897  case kmp_sch_trapezoidal:
2898  *kind = kmp_sched_trapezoidal;
2899  break;
2900 #if KMP_STATIC_STEAL_ENABLED
2901  case kmp_sch_static_steal:
2902  *kind = kmp_sched_static_steal;
2903  break;
2904 #endif
2905  default:
2906  KMP_FATAL(UnknownSchedulingType, th_type);
2907  }
2908 
2909  __kmp_sched_apply_mods_stdkind(kind, th_type);
2910  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2911 }
2912 
2913 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2914 
2915  int ii, dd;
2916  kmp_team_t *team;
2917  kmp_info_t *thr;
2918 
2919  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2920  KMP_DEBUG_ASSERT(__kmp_init_serial);
2921 
2922  // validate level
2923  if (level == 0)
2924  return 0;
2925  if (level < 0)
2926  return -1;
2927  thr = __kmp_threads[gtid];
2928  team = thr->th.th_team;
2929  ii = team->t.t_level;
2930  if (level > ii)
2931  return -1;
2932 
2933 #if OMP_40_ENABLED
2934  if (thr->th.th_teams_microtask) {
2935  // AC: we are in teams region where multiple nested teams have same level
2936  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2937  if (level <=
2938  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2939  KMP_DEBUG_ASSERT(ii >= tlevel);
2940  // AC: As we need to pass by the teams league, we need to artificially
2941  // increase ii
2942  if (ii == tlevel) {
2943  ii += 2; // three teams have same level
2944  } else {
2945  ii++; // two teams have same level
2946  }
2947  }
2948  }
2949 #endif
2950 
2951  if (ii == level)
2952  return __kmp_tid_from_gtid(gtid);
2953 
2954  dd = team->t.t_serialized;
2955  level++;
2956  while (ii > level) {
2957  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2958  }
2959  if ((team->t.t_serialized) && (!dd)) {
2960  team = team->t.t_parent;
2961  continue;
2962  }
2963  if (ii > level) {
2964  team = team->t.t_parent;
2965  dd = team->t.t_serialized;
2966  ii--;
2967  }
2968  }
2969 
2970  return (dd > 1) ? (0) : (team->t.t_master_tid);
2971 }
2972 
2973 int __kmp_get_team_size(int gtid, int level) {
2974 
2975  int ii, dd;
2976  kmp_team_t *team;
2977  kmp_info_t *thr;
2978 
2979  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2980  KMP_DEBUG_ASSERT(__kmp_init_serial);
2981 
2982  // validate level
2983  if (level == 0)
2984  return 1;
2985  if (level < 0)
2986  return -1;
2987  thr = __kmp_threads[gtid];
2988  team = thr->th.th_team;
2989  ii = team->t.t_level;
2990  if (level > ii)
2991  return -1;
2992 
2993 #if OMP_40_ENABLED
2994  if (thr->th.th_teams_microtask) {
2995  // AC: we are in teams region where multiple nested teams have same level
2996  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2997  if (level <=
2998  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2999  KMP_DEBUG_ASSERT(ii >= tlevel);
3000  // AC: As we need to pass by the teams league, we need to artificially
3001  // increase ii
3002  if (ii == tlevel) {
3003  ii += 2; // three teams have same level
3004  } else {
3005  ii++; // two teams have same level
3006  }
3007  }
3008  }
3009 #endif
3010 
3011  while (ii > level) {
3012  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3013  }
3014  if (team->t.t_serialized && (!dd)) {
3015  team = team->t.t_parent;
3016  continue;
3017  }
3018  if (ii > level) {
3019  team = team->t.t_parent;
3020  ii--;
3021  }
3022  }
3023 
3024  return team->t.t_nproc;
3025 }
3026 
3027 kmp_r_sched_t __kmp_get_schedule_global() {
3028  // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3029  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3030  // independently. So one can get the updated schedule here.
3031 
3032  kmp_r_sched_t r_sched;
3033 
3034  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3035  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3036  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3037  // different roots (even in OMP 2.5)
3038  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3039 #if OMP_45_ENABLED
3040  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3041 #endif
3042  if (s == kmp_sch_static) {
3043  // replace STATIC with more detailed schedule (balanced or greedy)
3044  r_sched.r_sched_type = __kmp_static;
3045  } else if (s == kmp_sch_guided_chunked) {
3046  // replace GUIDED with more detailed schedule (iterative or analytical)
3047  r_sched.r_sched_type = __kmp_guided;
3048  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3049  r_sched.r_sched_type = __kmp_sched;
3050  }
3051 #if OMP_45_ENABLED
3052  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3053 #endif
3054 
3055  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3056  // __kmp_chunk may be wrong here (if it was not ever set)
3057  r_sched.chunk = KMP_DEFAULT_CHUNK;
3058  } else {
3059  r_sched.chunk = __kmp_chunk;
3060  }
3061 
3062  return r_sched;
3063 }
3064 
3065 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3066  at least argc number of *t_argv entries for the requested team. */
3067 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3068 
3069  KMP_DEBUG_ASSERT(team);
3070  if (!realloc || argc > team->t.t_max_argc) {
3071 
3072  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3073  "current entries=%d\n",
3074  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3075  /* if previously allocated heap space for args, free them */
3076  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3077  __kmp_free((void *)team->t.t_argv);
3078 
3079  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3080  /* use unused space in the cache line for arguments */
3081  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3082  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3083  "argv entries\n",
3084  team->t.t_id, team->t.t_max_argc));
3085  team->t.t_argv = &team->t.t_inline_argv[0];
3086  if (__kmp_storage_map) {
3087  __kmp_print_storage_map_gtid(
3088  -1, &team->t.t_inline_argv[0],
3089  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3090  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3091  team->t.t_id);
3092  }
3093  } else {
3094  /* allocate space for arguments in the heap */
3095  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3096  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3097  : 2 * argc;
3098  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3099  "argv entries\n",
3100  team->t.t_id, team->t.t_max_argc));
3101  team->t.t_argv =
3102  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3103  if (__kmp_storage_map) {
3104  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3105  &team->t.t_argv[team->t.t_max_argc],
3106  sizeof(void *) * team->t.t_max_argc,
3107  "team_%d.t_argv", team->t.t_id);
3108  }
3109  }
3110  }
3111 }
3112 
3113 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3114  int i;
3115  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3116  team->t.t_threads =
3117  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3118  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3119  sizeof(dispatch_shared_info_t) * num_disp_buff);
3120  team->t.t_dispatch =
3121  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3122  team->t.t_implicit_task_taskdata =
3123  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3124  team->t.t_max_nproc = max_nth;
3125 
3126  /* setup dispatch buffers */
3127  for (i = 0; i < num_disp_buff; ++i) {
3128  team->t.t_disp_buffer[i].buffer_index = i;
3129 #if OMP_45_ENABLED
3130  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3131 #endif
3132  }
3133 }
3134 
3135 static void __kmp_free_team_arrays(kmp_team_t *team) {
3136  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3137  int i;
3138  for (i = 0; i < team->t.t_max_nproc; ++i) {
3139  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3140  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3141  team->t.t_dispatch[i].th_disp_buffer = NULL;
3142  }
3143  }
3144 #if KMP_USE_HIER_SCHED
3145  __kmp_dispatch_free_hierarchies(team);
3146 #endif
3147  __kmp_free(team->t.t_threads);
3148  __kmp_free(team->t.t_disp_buffer);
3149  __kmp_free(team->t.t_dispatch);
3150  __kmp_free(team->t.t_implicit_task_taskdata);
3151  team->t.t_threads = NULL;
3152  team->t.t_disp_buffer = NULL;
3153  team->t.t_dispatch = NULL;
3154  team->t.t_implicit_task_taskdata = 0;
3155 }
3156 
3157 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3158  kmp_info_t **oldThreads = team->t.t_threads;
3159 
3160  __kmp_free(team->t.t_disp_buffer);
3161  __kmp_free(team->t.t_dispatch);
3162  __kmp_free(team->t.t_implicit_task_taskdata);
3163  __kmp_allocate_team_arrays(team, max_nth);
3164 
3165  KMP_MEMCPY(team->t.t_threads, oldThreads,
3166  team->t.t_nproc * sizeof(kmp_info_t *));
3167 
3168  __kmp_free(oldThreads);
3169 }
3170 
3171 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3172 
3173  kmp_r_sched_t r_sched =
3174  __kmp_get_schedule_global(); // get current state of scheduling globals
3175 
3176 #if OMP_40_ENABLED
3177  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3178 #endif /* OMP_40_ENABLED */
3179 
3180  kmp_internal_control_t g_icvs = {
3181  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3182  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3183  // adjustment of threads (per thread)
3184  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3185  // whether blocktime is explicitly set
3186  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3187 #if KMP_USE_MONITOR
3188  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3189 // intervals
3190 #endif
3191  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3192  // next parallel region (per thread)
3193  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3194  __kmp_cg_max_nth, // int thread_limit;
3195  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3196  // for max_active_levels
3197  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3198 // {sched,chunk} pair
3199 #if OMP_40_ENABLED
3200  __kmp_nested_proc_bind.bind_types[0],
3201  __kmp_default_device,
3202 #endif /* OMP_40_ENABLED */
3203  NULL // struct kmp_internal_control *next;
3204  };
3205 
3206  return g_icvs;
3207 }
3208 
3209 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3210 
3211  kmp_internal_control_t gx_icvs;
3212  gx_icvs.serial_nesting_level =
3213  0; // probably =team->t.t_serial like in save_inter_controls
3214  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3215  gx_icvs.next = NULL;
3216 
3217  return gx_icvs;
3218 }
3219 
3220 static void __kmp_initialize_root(kmp_root_t *root) {
3221  int f;
3222  kmp_team_t *root_team;
3223  kmp_team_t *hot_team;
3224  int hot_team_max_nth;
3225  kmp_r_sched_t r_sched =
3226  __kmp_get_schedule_global(); // get current state of scheduling globals
3227  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3228  KMP_DEBUG_ASSERT(root);
3229  KMP_ASSERT(!root->r.r_begin);
3230 
3231  /* setup the root state structure */
3232  __kmp_init_lock(&root->r.r_begin_lock);
3233  root->r.r_begin = FALSE;
3234  root->r.r_active = FALSE;
3235  root->r.r_in_parallel = 0;
3236  root->r.r_blocktime = __kmp_dflt_blocktime;
3237 
3238  /* setup the root team for this task */
3239  /* allocate the root team structure */
3240  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3241 
3242  root_team =
3243  __kmp_allocate_team(root,
3244  1, // new_nproc
3245  1, // max_nproc
3246 #if OMPT_SUPPORT
3247  ompt_data_none, // root parallel id
3248 #endif
3249 #if OMP_40_ENABLED
3250  __kmp_nested_proc_bind.bind_types[0],
3251 #endif
3252  &r_icvs,
3253  0 // argc
3254  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3255  );
3256 #if USE_DEBUGGER
3257  // Non-NULL value should be assigned to make the debugger display the root
3258  // team.
3259  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3260 #endif
3261 
3262  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3263 
3264  root->r.r_root_team = root_team;
3265  root_team->t.t_control_stack_top = NULL;
3266 
3267  /* initialize root team */
3268  root_team->t.t_threads[0] = NULL;
3269  root_team->t.t_nproc = 1;
3270  root_team->t.t_serialized = 1;
3271  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3272  root_team->t.t_sched.sched = r_sched.sched;
3273  KA_TRACE(
3274  20,
3275  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3276  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3277 
3278  /* setup the hot team for this task */
3279  /* allocate the hot team structure */
3280  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3281 
3282  hot_team =
3283  __kmp_allocate_team(root,
3284  1, // new_nproc
3285  __kmp_dflt_team_nth_ub * 2, // max_nproc
3286 #if OMPT_SUPPORT
3287  ompt_data_none, // root parallel id
3288 #endif
3289 #if OMP_40_ENABLED
3290  __kmp_nested_proc_bind.bind_types[0],
3291 #endif
3292  &r_icvs,
3293  0 // argc
3294  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3295  );
3296  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3297 
3298  root->r.r_hot_team = hot_team;
3299  root_team->t.t_control_stack_top = NULL;
3300 
3301  /* first-time initialization */
3302  hot_team->t.t_parent = root_team;
3303 
3304  /* initialize hot team */
3305  hot_team_max_nth = hot_team->t.t_max_nproc;
3306  for (f = 0; f < hot_team_max_nth; ++f) {
3307  hot_team->t.t_threads[f] = NULL;
3308  }
3309  hot_team->t.t_nproc = 1;
3310  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3311  hot_team->t.t_sched.sched = r_sched.sched;
3312  hot_team->t.t_size_changed = 0;
3313 }
3314 
3315 #ifdef KMP_DEBUG
3316 
3317 typedef struct kmp_team_list_item {
3318  kmp_team_p const *entry;
3319  struct kmp_team_list_item *next;
3320 } kmp_team_list_item_t;
3321 typedef kmp_team_list_item_t *kmp_team_list_t;
3322 
3323 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3324  kmp_team_list_t list, // List of teams.
3325  kmp_team_p const *team // Team to add.
3326  ) {
3327 
3328  // List must terminate with item where both entry and next are NULL.
3329  // Team is added to the list only once.
3330  // List is sorted in ascending order by team id.
3331  // Team id is *not* a key.
3332 
3333  kmp_team_list_t l;
3334 
3335  KMP_DEBUG_ASSERT(list != NULL);
3336  if (team == NULL) {
3337  return;
3338  }
3339 
3340  __kmp_print_structure_team_accum(list, team->t.t_parent);
3341  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3342 
3343  // Search list for the team.
3344  l = list;
3345  while (l->next != NULL && l->entry != team) {
3346  l = l->next;
3347  }
3348  if (l->next != NULL) {
3349  return; // Team has been added before, exit.
3350  }
3351 
3352  // Team is not found. Search list again for insertion point.
3353  l = list;
3354  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3355  l = l->next;
3356  }
3357 
3358  // Insert team.
3359  {
3360  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3361  sizeof(kmp_team_list_item_t));
3362  *item = *l;
3363  l->entry = team;
3364  l->next = item;
3365  }
3366 }
3367 
3368 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3369 
3370  ) {
3371  __kmp_printf("%s", title);
3372  if (team != NULL) {
3373  __kmp_printf("%2x %p\n", team->t.t_id, team);
3374  } else {
3375  __kmp_printf(" - (nil)\n");
3376  }
3377 }
3378 
3379 static void __kmp_print_structure_thread(char const *title,
3380  kmp_info_p const *thread) {
3381  __kmp_printf("%s", title);
3382  if (thread != NULL) {
3383  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3384  } else {
3385  __kmp_printf(" - (nil)\n");
3386  }
3387 }
3388 
3389 void __kmp_print_structure(void) {
3390 
3391  kmp_team_list_t list;
3392 
3393  // Initialize list of teams.
3394  list =
3395  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3396  list->entry = NULL;
3397  list->next = NULL;
3398 
3399  __kmp_printf("\n------------------------------\nGlobal Thread "
3400  "Table\n------------------------------\n");
3401  {
3402  int gtid;
3403  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3404  __kmp_printf("%2d", gtid);
3405  if (__kmp_threads != NULL) {
3406  __kmp_printf(" %p", __kmp_threads[gtid]);
3407  }
3408  if (__kmp_root != NULL) {
3409  __kmp_printf(" %p", __kmp_root[gtid]);
3410  }
3411  __kmp_printf("\n");
3412  }
3413  }
3414 
3415  // Print out __kmp_threads array.
3416  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3417  "----------\n");
3418  if (__kmp_threads != NULL) {
3419  int gtid;
3420  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3421  kmp_info_t const *thread = __kmp_threads[gtid];
3422  if (thread != NULL) {
3423  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3424  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3425  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3426  __kmp_print_structure_team(" Serial Team: ",
3427  thread->th.th_serial_team);
3428  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3429  __kmp_print_structure_thread(" Master: ",
3430  thread->th.th_team_master);
3431  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3432  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3433 #if OMP_40_ENABLED
3434  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3435 #endif
3436  __kmp_print_structure_thread(" Next in pool: ",
3437  thread->th.th_next_pool);
3438  __kmp_printf("\n");
3439  __kmp_print_structure_team_accum(list, thread->th.th_team);
3440  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3441  }
3442  }
3443  } else {
3444  __kmp_printf("Threads array is not allocated.\n");
3445  }
3446 
3447  // Print out __kmp_root array.
3448  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3449  "--------\n");
3450  if (__kmp_root != NULL) {
3451  int gtid;
3452  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3453  kmp_root_t const *root = __kmp_root[gtid];
3454  if (root != NULL) {
3455  __kmp_printf("GTID %2d %p:\n", gtid, root);
3456  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3457  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3458  __kmp_print_structure_thread(" Uber Thread: ",
3459  root->r.r_uber_thread);
3460  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3461  __kmp_printf(" In Parallel: %2d\n",
3462  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3463  __kmp_printf("\n");
3464  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3465  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3466  }
3467  }
3468  } else {
3469  __kmp_printf("Ubers array is not allocated.\n");
3470  }
3471 
3472  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3473  "--------\n");
3474  while (list->next != NULL) {
3475  kmp_team_p const *team = list->entry;
3476  int i;
3477  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3478  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3479  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3480  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3481  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3482  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3483  for (i = 0; i < team->t.t_nproc; ++i) {
3484  __kmp_printf(" Thread %2d: ", i);
3485  __kmp_print_structure_thread("", team->t.t_threads[i]);
3486  }
3487  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3488  __kmp_printf("\n");
3489  list = list->next;
3490  }
3491 
3492  // Print out __kmp_thread_pool and __kmp_team_pool.
3493  __kmp_printf("\n------------------------------\nPools\n----------------------"
3494  "--------\n");
3495  __kmp_print_structure_thread("Thread pool: ",
3496  CCAST(kmp_info_t *, __kmp_thread_pool));
3497  __kmp_print_structure_team("Team pool: ",
3498  CCAST(kmp_team_t *, __kmp_team_pool));
3499  __kmp_printf("\n");
3500 
3501  // Free team list.
3502  while (list != NULL) {
3503  kmp_team_list_item_t *item = list;
3504  list = list->next;
3505  KMP_INTERNAL_FREE(item);
3506  }
3507 }
3508 
3509 #endif
3510 
3511 //---------------------------------------------------------------------------
3512 // Stuff for per-thread fast random number generator
3513 // Table of primes
3514 static const unsigned __kmp_primes[] = {
3515  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3516  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3517  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3518  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3519  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3520  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3521  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3522  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3523  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3524  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3525  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3526 
3527 //---------------------------------------------------------------------------
3528 // __kmp_get_random: Get a random number using a linear congruential method.
3529 unsigned short __kmp_get_random(kmp_info_t *thread) {
3530  unsigned x = thread->th.th_x;
3531  unsigned short r = x >> 16;
3532 
3533  thread->th.th_x = x * thread->th.th_a + 1;
3534 
3535  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3536  thread->th.th_info.ds.ds_tid, r));
3537 
3538  return r;
3539 }
3540 //--------------------------------------------------------
3541 // __kmp_init_random: Initialize a random number generator
3542 void __kmp_init_random(kmp_info_t *thread) {
3543  unsigned seed = thread->th.th_info.ds.ds_tid;
3544 
3545  thread->th.th_a =
3546  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3547  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3548  KA_TRACE(30,
3549  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3550 }
3551 
3552 #if KMP_OS_WINDOWS
3553 /* reclaim array entries for root threads that are already dead, returns number
3554  * reclaimed */
3555 static int __kmp_reclaim_dead_roots(void) {
3556  int i, r = 0;
3557 
3558  for (i = 0; i < __kmp_threads_capacity; ++i) {
3559  if (KMP_UBER_GTID(i) &&
3560  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3561  !__kmp_root[i]
3562  ->r.r_active) { // AC: reclaim only roots died in non-active state
3563  r += __kmp_unregister_root_other_thread(i);
3564  }
3565  }
3566  return r;
3567 }
3568 #endif
3569 
3570 /* This function attempts to create free entries in __kmp_threads and
3571  __kmp_root, and returns the number of free entries generated.
3572 
3573  For Windows* OS static library, the first mechanism used is to reclaim array
3574  entries for root threads that are already dead.
3575 
3576  On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3577  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3578  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3579  threadprivate cache array has been created. Synchronization with
3580  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3581 
3582  After any dead root reclamation, if the clipping value allows array expansion
3583  to result in the generation of a total of nNeed free slots, the function does
3584  that expansion. If not, nothing is done beyond the possible initial root
3585  thread reclamation.
3586 
3587  If any argument is negative, the behavior is undefined. */
3588 static int __kmp_expand_threads(int nNeed) {
3589  int added = 0;
3590  int minimumRequiredCapacity;
3591  int newCapacity;
3592  kmp_info_t **newThreads;
3593  kmp_root_t **newRoot;
3594 
3595 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3596 // resizing __kmp_threads does not need additional protection if foreign
3597 // threads are present
3598 
3599 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3600  /* only for Windows static library */
3601  /* reclaim array entries for root threads that are already dead */
3602  added = __kmp_reclaim_dead_roots();
3603 
3604  if (nNeed) {
3605  nNeed -= added;
3606  if (nNeed < 0)
3607  nNeed = 0;
3608  }
3609 #endif
3610  if (nNeed <= 0)
3611  return added;
3612 
3613  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3614  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3615  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3616  // > __kmp_max_nth in one of two ways:
3617  //
3618  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3619  // may not be resused by another thread, so we may need to increase
3620  // __kmp_threads_capacity to __kmp_max_nth + 1.
3621  //
3622  // 2) New foreign root(s) are encountered. We always register new foreign
3623  // roots. This may cause a smaller # of threads to be allocated at
3624  // subsequent parallel regions, but the worker threads hang around (and
3625  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3626  //
3627  // Anyway, that is the reason for moving the check to see if
3628  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3629  // instead of having it performed here. -BB
3630 
3631  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3632 
3633  /* compute expansion headroom to check if we can expand */
3634  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3635  /* possible expansion too small -- give up */
3636  return added;
3637  }
3638  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3639 
3640  newCapacity = __kmp_threads_capacity;
3641  do {
3642  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3643  : __kmp_sys_max_nth;
3644  } while (newCapacity < minimumRequiredCapacity);
3645  newThreads = (kmp_info_t **)__kmp_allocate(
3646  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3647  newRoot =
3648  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3649  KMP_MEMCPY(newThreads, __kmp_threads,
3650  __kmp_threads_capacity * sizeof(kmp_info_t *));
3651  KMP_MEMCPY(newRoot, __kmp_root,
3652  __kmp_threads_capacity * sizeof(kmp_root_t *));
3653 
3654  kmp_info_t **temp_threads = __kmp_threads;
3655  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3656  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3657  __kmp_free(temp_threads);
3658  added += newCapacity - __kmp_threads_capacity;
3659  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3660 
3661  if (newCapacity > __kmp_tp_capacity) {
3662  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3663  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3664  __kmp_threadprivate_resize_cache(newCapacity);
3665  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3666  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3667  }
3668  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3669  }
3670 
3671  return added;
3672 }
3673 
3674 /* Register the current thread as a root thread and obtain our gtid. We must
3675  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3676  thread that calls from __kmp_do_serial_initialize() */
3677 int __kmp_register_root(int initial_thread) {
3678  kmp_info_t *root_thread;
3679  kmp_root_t *root;
3680  int gtid;
3681  int capacity;
3682  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3683  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3684  KMP_MB();
3685 
3686  /* 2007-03-02:
3687  If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3688  initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3689  work as expected -- it may return false (that means there is at least one
3690  empty slot in __kmp_threads array), but it is possible the only free slot
3691  is #0, which is reserved for initial thread and so cannot be used for this
3692  one. Following code workarounds this bug.
3693 
3694  However, right solution seems to be not reserving slot #0 for initial
3695  thread because:
3696  (1) there is no magic in slot #0,
3697  (2) we cannot detect initial thread reliably (the first thread which does
3698  serial initialization may be not a real initial thread).
3699  */
3700  capacity = __kmp_threads_capacity;
3701  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3702  --capacity;
3703  }
3704 
3705  /* see if there are too many threads */
3706  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3707  if (__kmp_tp_cached) {
3708  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3709  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3710  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3711  } else {
3712  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3713  __kmp_msg_null);
3714  }
3715  }
3716 
3717  /* find an available thread slot */
3718  /* Don't reassign the zero slot since we need that to only be used by initial
3719  thread */
3720  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3721  gtid++)
3722  ;
3723  KA_TRACE(1,
3724  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3725  KMP_ASSERT(gtid < __kmp_threads_capacity);
3726 
3727  /* update global accounting */
3728  __kmp_all_nth++;
3729  TCW_4(__kmp_nth, __kmp_nth + 1);
3730 
3731  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3732  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3733  if (__kmp_adjust_gtid_mode) {
3734  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3735  if (TCR_4(__kmp_gtid_mode) != 2) {
3736  TCW_4(__kmp_gtid_mode, 2);
3737  }
3738  } else {
3739  if (TCR_4(__kmp_gtid_mode) != 1) {
3740  TCW_4(__kmp_gtid_mode, 1);
3741  }
3742  }
3743  }
3744 
3745 #ifdef KMP_ADJUST_BLOCKTIME
3746  /* Adjust blocktime to zero if necessary */
3747  /* Middle initialization might not have occurred yet */
3748  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3749  if (__kmp_nth > __kmp_avail_proc) {
3750  __kmp_zero_bt = TRUE;
3751  }
3752  }
3753 #endif /* KMP_ADJUST_BLOCKTIME */
3754 
3755  /* setup this new hierarchy */
3756  if (!(root = __kmp_root[gtid])) {
3757  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3758  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3759  }
3760 
3761 #if KMP_STATS_ENABLED
3762  // Initialize stats as soon as possible (right after gtid assignment).
3763  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3764  __kmp_stats_thread_ptr->startLife();
3765  KMP_SET_THREAD_STATE(SERIAL_REGION);
3766  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3767 #endif
3768  __kmp_initialize_root(root);
3769 
3770  /* setup new root thread structure */
3771  if (root->r.r_uber_thread) {
3772  root_thread = root->r.r_uber_thread;
3773  } else {
3774  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3775  if (__kmp_storage_map) {
3776  __kmp_print_thread_storage_map(root_thread, gtid);
3777  }
3778  root_thread->th.th_info.ds.ds_gtid = gtid;
3779 #if OMPT_SUPPORT
3780  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3781 #endif
3782  root_thread->th.th_root = root;
3783  if (__kmp_env_consistency_check) {
3784  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3785  }
3786 #if USE_FAST_MEMORY
3787  __kmp_initialize_fast_memory(root_thread);
3788 #endif /* USE_FAST_MEMORY */
3789 
3790 #if KMP_USE_BGET
3791  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3792  __kmp_initialize_bget(root_thread);
3793 #endif
3794  __kmp_init_random(root_thread); // Initialize random number generator
3795  }
3796 
3797  /* setup the serial team held in reserve by the root thread */
3798  if (!root_thread->th.th_serial_team) {
3799  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3800  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3801  root_thread->th.th_serial_team =
3802  __kmp_allocate_team(root, 1, 1,
3803 #if OMPT_SUPPORT
3804  ompt_data_none, // root parallel id
3805 #endif
3806 #if OMP_40_ENABLED
3807  proc_bind_default,
3808 #endif
3809  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3810  }
3811  KMP_ASSERT(root_thread->th.th_serial_team);
3812  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3813  root_thread->th.th_serial_team));
3814 
3815  /* drop root_thread into place */
3816  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3817 
3818  root->r.r_root_team->t.t_threads[0] = root_thread;
3819  root->r.r_hot_team->t.t_threads[0] = root_thread;
3820  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3821  // AC: the team created in reserve, not for execution (it is unused for now).
3822  root_thread->th.th_serial_team->t.t_serialized = 0;
3823  root->r.r_uber_thread = root_thread;
3824 
3825  /* initialize the thread, get it ready to go */
3826  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3827  TCW_4(__kmp_init_gtid, TRUE);
3828 
3829  /* prepare the master thread for get_gtid() */
3830  __kmp_gtid_set_specific(gtid);
3831 
3832 #if USE_ITT_BUILD
3833  __kmp_itt_thread_name(gtid);
3834 #endif /* USE_ITT_BUILD */
3835 
3836 #ifdef KMP_TDATA_GTID
3837  __kmp_gtid = gtid;
3838 #endif
3839  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3840  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3841 
3842  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3843  "plain=%u\n",
3844  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3845  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3846  KMP_INIT_BARRIER_STATE));
3847  { // Initialize barrier data.
3848  int b;
3849  for (b = 0; b < bs_last_barrier; ++b) {
3850  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3851 #if USE_DEBUGGER
3852  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3853 #endif
3854  }
3855  }
3856  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3857  KMP_INIT_BARRIER_STATE);
3858 
3859 #if KMP_AFFINITY_SUPPORTED
3860 #if OMP_40_ENABLED
3861  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3862  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3863  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3864  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3865 #endif
3866  if (TCR_4(__kmp_init_middle)) {
3867  __kmp_affinity_set_init_mask(gtid, TRUE);
3868  }
3869 #endif /* KMP_AFFINITY_SUPPORTED */
3870 #if OMP_50_ENABLED
3871  root_thread->th.th_def_allocator = __kmp_def_allocator;
3872  root_thread->th.th_prev_level = 0;
3873  root_thread->th.th_prev_num_threads = 1;
3874 #endif
3875 
3876  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3877  tmp->cg_root = root_thread;
3878  tmp->cg_thread_limit = __kmp_cg_max_nth;
3879  tmp->cg_nthreads = 1;
3880  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3881  " cg_nthreads init to 1\n",
3882  root_thread, tmp));
3883  tmp->up = NULL;
3884  root_thread->th.th_cg_roots = tmp;
3885 
3886  __kmp_root_counter++;
3887 
3888 #if OMPT_SUPPORT
3889  if (!initial_thread && ompt_enabled.enabled) {
3890 
3891  kmp_info_t *root_thread = ompt_get_thread();
3892 
3893  ompt_set_thread_state(root_thread, ompt_state_overhead);
3894 
3895  if (ompt_enabled.ompt_callback_thread_begin) {
3896  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3897  ompt_thread_initial, __ompt_get_thread_data_internal());
3898  }
3899  ompt_data_t *task_data;
3900  ompt_data_t *parallel_data;
3901  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3902  if (ompt_enabled.ompt_callback_implicit_task) {
3903  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3904  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3905  }
3906 
3907  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3908  }
3909 #endif
3910 
3911  KMP_MB();
3912  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3913 
3914  return gtid;
3915 }
3916 
3917 #if KMP_NESTED_HOT_TEAMS
3918 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3919  const int max_level) {
3920  int i, n, nth;
3921  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3922  if (!hot_teams || !hot_teams[level].hot_team) {
3923  return 0;
3924  }
3925  KMP_DEBUG_ASSERT(level < max_level);
3926  kmp_team_t *team = hot_teams[level].hot_team;
3927  nth = hot_teams[level].hot_team_nth;
3928  n = nth - 1; // master is not freed
3929  if (level < max_level - 1) {
3930  for (i = 0; i < nth; ++i) {
3931  kmp_info_t *th = team->t.t_threads[i];
3932  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3933  if (i > 0 && th->th.th_hot_teams) {
3934  __kmp_free(th->th.th_hot_teams);
3935  th->th.th_hot_teams = NULL;
3936  }
3937  }
3938  }
3939  __kmp_free_team(root, team, NULL);
3940  return n;
3941 }
3942 #endif
3943 
3944 // Resets a root thread and clear its root and hot teams.
3945 // Returns the number of __kmp_threads entries directly and indirectly freed.
3946 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3947  kmp_team_t *root_team = root->r.r_root_team;
3948  kmp_team_t *hot_team = root->r.r_hot_team;
3949  int n = hot_team->t.t_nproc;
3950  int i;
3951 
3952  KMP_DEBUG_ASSERT(!root->r.r_active);
3953 
3954  root->r.r_root_team = NULL;
3955  root->r.r_hot_team = NULL;
3956  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3957  // before call to __kmp_free_team().
3958  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3959 #if KMP_NESTED_HOT_TEAMS
3960  if (__kmp_hot_teams_max_level >
3961  0) { // need to free nested hot teams and their threads if any
3962  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3963  kmp_info_t *th = hot_team->t.t_threads[i];
3964  if (__kmp_hot_teams_max_level > 1) {
3965  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3966  }
3967  if (th->th.th_hot_teams) {
3968  __kmp_free(th->th.th_hot_teams);
3969  th->th.th_hot_teams = NULL;
3970  }
3971  }
3972  }
3973 #endif
3974  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3975 
3976  // Before we can reap the thread, we need to make certain that all other
3977  // threads in the teams that had this root as ancestor have stopped trying to
3978  // steal tasks.
3979  if (__kmp_tasking_mode != tskm_immediate_exec) {
3980  __kmp_wait_to_unref_task_teams();
3981  }
3982 
3983 #if KMP_OS_WINDOWS
3984  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3985  KA_TRACE(
3986  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3987  "\n",
3988  (LPVOID) & (root->r.r_uber_thread->th),
3989  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3990  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3991 #endif /* KMP_OS_WINDOWS */
3992 
3993 #if OMPT_SUPPORT
3994  ompt_data_t *task_data;
3995  ompt_data_t *parallel_data;
3996  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3997  if (ompt_enabled.ompt_callback_implicit_task) {
3998  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3999  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4000  }
4001  if (ompt_enabled.ompt_callback_thread_end) {
4002  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4003  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4004  }
4005 #endif
4006 
4007  TCW_4(__kmp_nth,
4008  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4009  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4010  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4011  " to %d\n",
4012  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4013  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4014  if (i == 1) {
4015  // need to free contention group structure
4016  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4017  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4018  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4019  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4020  root->r.r_uber_thread->th.th_cg_roots = NULL;
4021  }
4022  __kmp_reap_thread(root->r.r_uber_thread, 1);
4023 
4024  // We canot put root thread to __kmp_thread_pool, so we have to reap it istead
4025  // of freeing.
4026  root->r.r_uber_thread = NULL;
4027  /* mark root as no longer in use */
4028  root->r.r_begin = FALSE;
4029 
4030  return n;
4031 }
4032 
4033 void __kmp_unregister_root_current_thread(int gtid) {
4034  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4035  /* this lock should be ok, since unregister_root_current_thread is never
4036  called during an abort, only during a normal close. furthermore, if you
4037  have the forkjoin lock, you should never try to get the initz lock */
4038  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4039  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4040  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4041  "exiting T#%d\n",
4042  gtid));
4043  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4044  return;
4045  }
4046  kmp_root_t *root = __kmp_root[gtid];
4047 
4048  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4049  KMP_ASSERT(KMP_UBER_GTID(gtid));
4050  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4051  KMP_ASSERT(root->r.r_active == FALSE);
4052 
4053  KMP_MB();
4054 
4055 #if OMP_45_ENABLED
4056  kmp_info_t *thread = __kmp_threads[gtid];
4057  kmp_team_t *team = thread->th.th_team;
4058  kmp_task_team_t *task_team = thread->th.th_task_team;
4059 
4060  // we need to wait for the proxy tasks before finishing the thread
4061  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4062 #if OMPT_SUPPORT
4063  // the runtime is shutting down so we won't report any events
4064  thread->th.ompt_thread_info.state = ompt_state_undefined;
4065 #endif
4066  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4067  }
4068 #endif
4069 
4070  __kmp_reset_root(gtid, root);
4071 
4072  /* free up this thread slot */
4073  __kmp_gtid_set_specific(KMP_GTID_DNE);
4074 #ifdef KMP_TDATA_GTID
4075  __kmp_gtid = KMP_GTID_DNE;
4076 #endif
4077 
4078  KMP_MB();
4079  KC_TRACE(10,
4080  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4081 
4082  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4083 }
4084 
4085 #if KMP_OS_WINDOWS
4086 /* __kmp_forkjoin_lock must be already held
4087  Unregisters a root thread that is not the current thread. Returns the number
4088  of __kmp_threads entries freed as a result. */
4089 static int __kmp_unregister_root_other_thread(int gtid) {
4090  kmp_root_t *root = __kmp_root[gtid];
4091  int r;
4092 
4093  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4094  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4095  KMP_ASSERT(KMP_UBER_GTID(gtid));
4096  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4097  KMP_ASSERT(root->r.r_active == FALSE);
4098 
4099  r = __kmp_reset_root(gtid, root);
4100  KC_TRACE(10,
4101  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4102  return r;
4103 }
4104 #endif
4105 
4106 #if KMP_DEBUG
4107 void __kmp_task_info() {
4108 
4109  kmp_int32 gtid = __kmp_entry_gtid();
4110  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4111  kmp_info_t *this_thr = __kmp_threads[gtid];
4112  kmp_team_t *steam = this_thr->th.th_serial_team;
4113  kmp_team_t *team = this_thr->th.th_team;
4114 
4115  __kmp_printf(
4116  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4117  "ptask=%p\n",
4118  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4119  team->t.t_implicit_task_taskdata[tid].td_parent);
4120 }
4121 #endif // KMP_DEBUG
4122 
4123 /* TODO optimize with one big memclr, take out what isn't needed, split
4124  responsibility to workers as much as possible, and delay initialization of
4125  features as much as possible */
4126 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4127  int tid, int gtid) {
4128  /* this_thr->th.th_info.ds.ds_gtid is setup in
4129  kmp_allocate_thread/create_worker.
4130  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4131  kmp_info_t *master = team->t.t_threads[0];
4132  KMP_DEBUG_ASSERT(this_thr != NULL);
4133  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4134  KMP_DEBUG_ASSERT(team);
4135  KMP_DEBUG_ASSERT(team->t.t_threads);
4136  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4137  KMP_DEBUG_ASSERT(master);
4138  KMP_DEBUG_ASSERT(master->th.th_root);
4139 
4140  KMP_MB();
4141 
4142  TCW_SYNC_PTR(this_thr->th.th_team, team);
4143 
4144  this_thr->th.th_info.ds.ds_tid = tid;
4145  this_thr->th.th_set_nproc = 0;
4146  if (__kmp_tasking_mode != tskm_immediate_exec)
4147  // When tasking is possible, threads are not safe to reap until they are
4148  // done tasking; this will be set when tasking code is exited in wait
4149  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4150  else // no tasking --> always safe to reap
4151  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4152 #if OMP_40_ENABLED
4153  this_thr->th.th_set_proc_bind = proc_bind_default;
4154 #if KMP_AFFINITY_SUPPORTED
4155  this_thr->th.th_new_place = this_thr->th.th_current_place;
4156 #endif
4157 #endif
4158  this_thr->th.th_root = master->th.th_root;
4159 
4160  /* setup the thread's cache of the team structure */
4161  this_thr->th.th_team_nproc = team->t.t_nproc;
4162  this_thr->th.th_team_master = master;
4163  this_thr->th.th_team_serialized = team->t.t_serialized;
4164  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4165 
4166  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4167 
4168  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4169  tid, gtid, this_thr, this_thr->th.th_current_task));
4170 
4171  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4172  team, tid, TRUE);
4173 
4174  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4175  tid, gtid, this_thr, this_thr->th.th_current_task));
4176  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4177  // __kmp_initialize_team()?
4178 
4179  /* TODO no worksharing in speculative threads */
4180  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4181 
4182  this_thr->th.th_local.this_construct = 0;
4183 
4184  if (!this_thr->th.th_pri_common) {
4185  this_thr->th.th_pri_common =
4186  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4187  if (__kmp_storage_map) {
4188  __kmp_print_storage_map_gtid(
4189  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4190  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4191  }
4192  this_thr->th.th_pri_head = NULL;
4193  }
4194 
4195  if (this_thr != master && // Master's CG root is initialized elsewhere
4196  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4197  // Make new thread's CG root same as master's
4198  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4199  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4200  // Increment new thread's CG root's counter to add the new thread
4201  this_thr->th.th_cg_roots->cg_nthreads++;
4202  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4203  " node %p of thread %p to %d\n",
4204  this_thr, this_thr->th.th_cg_roots,
4205  this_thr->th.th_cg_roots->cg_root,
4206  this_thr->th.th_cg_roots->cg_nthreads));
4207  this_thr->th.th_current_task->td_icvs.thread_limit =
4208  this_thr->th.th_cg_roots->cg_thread_limit;
4209  }
4210 
4211  /* Initialize dynamic dispatch */
4212  {
4213  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4214  // Use team max_nproc since this will never change for the team.
4215  size_t disp_size =
4216  sizeof(dispatch_private_info_t) *
4217  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4218  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4219  team->t.t_max_nproc));
4220  KMP_ASSERT(dispatch);
4221  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4222  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4223 
4224  dispatch->th_disp_index = 0;
4225 #if OMP_45_ENABLED
4226  dispatch->th_doacross_buf_idx = 0;
4227 #endif
4228  if (!dispatch->th_disp_buffer) {
4229  dispatch->th_disp_buffer =
4230  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4231 
4232  if (__kmp_storage_map) {
4233  __kmp_print_storage_map_gtid(
4234  gtid, &dispatch->th_disp_buffer[0],
4235  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4236  ? 1
4237  : __kmp_dispatch_num_buffers],
4238  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4239  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4240  gtid, team->t.t_id, gtid);
4241  }
4242  } else {
4243  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4244  }
4245 
4246  dispatch->th_dispatch_pr_current = 0;
4247  dispatch->th_dispatch_sh_current = 0;
4248 
4249  dispatch->th_deo_fcn = 0; /* ORDERED */
4250  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4251  }
4252 
4253  this_thr->th.th_next_pool = NULL;
4254 
4255  if (!this_thr->th.th_task_state_memo_stack) {
4256  size_t i;
4257  this_thr->th.th_task_state_memo_stack =
4258  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4259  this_thr->th.th_task_state_top = 0;
4260  this_thr->th.th_task_state_stack_sz = 4;
4261  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4262  ++i) // zero init the stack
4263  this_thr->th.th_task_state_memo_stack[i] = 0;
4264  }
4265 
4266  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4267  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4268 
4269  KMP_MB();
4270 }
4271 
4272 /* allocate a new thread for the requesting team. this is only called from
4273  within a forkjoin critical section. we will first try to get an available
4274  thread from the thread pool. if none is available, we will fork a new one
4275  assuming we are able to create a new one. this should be assured, as the
4276  caller should check on this first. */
4277 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4278  int new_tid) {
4279  kmp_team_t *serial_team;
4280  kmp_info_t *new_thr;
4281  int new_gtid;
4282 
4283  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4284  KMP_DEBUG_ASSERT(root && team);
4285 #if !KMP_NESTED_HOT_TEAMS
4286  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4287 #endif
4288  KMP_MB();
4289 
4290  /* first, try to get one from the thread pool */
4291  if (__kmp_thread_pool) {
4292  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4293  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4294  if (new_thr == __kmp_thread_pool_insert_pt) {
4295  __kmp_thread_pool_insert_pt = NULL;
4296  }
4297  TCW_4(new_thr->th.th_in_pool, FALSE);
4298  __kmp_suspend_initialize_thread(new_thr);
4299  __kmp_lock_suspend_mx(new_thr);
4300  if (new_thr->th.th_active_in_pool == TRUE) {
4301  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4302  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4303  new_thr->th.th_active_in_pool = FALSE;
4304  }
4305  __kmp_unlock_suspend_mx(new_thr);
4306 
4307  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4308  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4309  KMP_ASSERT(!new_thr->th.th_team);
4310  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4311 
4312  /* setup the thread structure */
4313  __kmp_initialize_info(new_thr, team, new_tid,
4314  new_thr->th.th_info.ds.ds_gtid);
4315  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4316 
4317  TCW_4(__kmp_nth, __kmp_nth + 1);
4318 
4319  new_thr->th.th_task_state = 0;
4320  new_thr->th.th_task_state_top = 0;
4321  new_thr->th.th_task_state_stack_sz = 4;
4322 
4323 #ifdef KMP_ADJUST_BLOCKTIME
4324  /* Adjust blocktime back to zero if necessary */
4325  /* Middle initialization might not have occurred yet */
4326  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4327  if (__kmp_nth > __kmp_avail_proc) {
4328  __kmp_zero_bt = TRUE;
4329  }
4330  }
4331 #endif /* KMP_ADJUST_BLOCKTIME */
4332 
4333 #if KMP_DEBUG
4334  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4335  // KMP_BARRIER_PARENT_FLAG.
4336  int b;
4337  kmp_balign_t *balign = new_thr->th.th_bar;
4338  for (b = 0; b < bs_last_barrier; ++b)
4339  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4340 #endif
4341 
4342  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4343  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4344 
4345  KMP_MB();
4346  return new_thr;
4347  }
4348 
4349  /* no, well fork a new one */
4350  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4351  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4352 
4353 #if KMP_USE_MONITOR
4354  // If this is the first worker thread the RTL is creating, then also
4355  // launch the monitor thread. We try to do this as early as possible.
4356  if (!TCR_4(__kmp_init_monitor)) {
4357  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4358  if (!TCR_4(__kmp_init_monitor)) {
4359  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4360  TCW_4(__kmp_init_monitor, 1);
4361  __kmp_create_monitor(&__kmp_monitor);
4362  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4363 #if KMP_OS_WINDOWS
4364  // AC: wait until monitor has started. This is a fix for CQ232808.
4365  // The reason is that if the library is loaded/unloaded in a loop with
4366  // small (parallel) work in between, then there is high probability that
4367  // monitor thread started after the library shutdown. At shutdown it is
4368  // too late to cope with the problem, because when the master is in
4369  // DllMain (process detach) the monitor has no chances to start (it is
4370  // blocked), and master has no means to inform the monitor that the
4371  // library has gone, because all the memory which the monitor can access
4372  // is going to be released/reset.
4373  while (TCR_4(__kmp_init_monitor) < 2) {
4374  KMP_YIELD(TRUE);
4375  }
4376  KF_TRACE(10, ("after monitor thread has started\n"));
4377 #endif
4378  }
4379  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4380  }
4381 #endif
4382 
4383  KMP_MB();
4384  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4385  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4386  }
4387 
4388  /* allocate space for it. */
4389  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4390 
4391  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4392 
4393  if (__kmp_storage_map) {
4394  __kmp_print_thread_storage_map(new_thr, new_gtid);
4395  }
4396 
4397  // add the reserve serialized team, initialized from the team's master thread
4398  {
4399  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4400  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4401  new_thr->th.th_serial_team = serial_team =
4402  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4403 #if OMPT_SUPPORT
4404  ompt_data_none, // root parallel id
4405 #endif
4406 #if OMP_40_ENABLED
4407  proc_bind_default,
4408 #endif
4409  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4410  }
4411  KMP_ASSERT(serial_team);
4412  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4413  // execution (it is unused for now).
4414  serial_team->t.t_threads[0] = new_thr;
4415  KF_TRACE(10,
4416  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4417  new_thr));
4418 
4419  /* setup the thread structures */
4420  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4421 
4422 #if USE_FAST_MEMORY
4423  __kmp_initialize_fast_memory(new_thr);
4424 #endif /* USE_FAST_MEMORY */
4425 
4426 #if KMP_USE_BGET
4427  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4428  __kmp_initialize_bget(new_thr);
4429 #endif
4430 
4431  __kmp_init_random(new_thr); // Initialize random number generator
4432 
4433  /* Initialize these only once when thread is grabbed for a team allocation */
4434  KA_TRACE(20,
4435  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4436  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4437 
4438  int b;
4439  kmp_balign_t *balign = new_thr->th.th_bar;
4440  for (b = 0; b < bs_last_barrier; ++b) {
4441  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4442  balign[b].bb.team = NULL;
4443  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4444  balign[b].bb.use_oncore_barrier = 0;
4445  }
4446 
4447  new_thr->th.th_spin_here = FALSE;
4448  new_thr->th.th_next_waiting = 0;
4449 #if KMP_OS_UNIX
4450  new_thr->th.th_blocking = false;
4451 #endif
4452 
4453 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4454  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4455  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4456  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4457  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4458 #endif
4459 #if OMP_50_ENABLED
4460  new_thr->th.th_def_allocator = __kmp_def_allocator;
4461  new_thr->th.th_prev_level = 0;
4462  new_thr->th.th_prev_num_threads = 1;
4463 #endif
4464 
4465  TCW_4(new_thr->th.th_in_pool, FALSE);
4466  new_thr->th.th_active_in_pool = FALSE;
4467  TCW_4(new_thr->th.th_active, TRUE);
4468 
4469  /* adjust the global counters */
4470  __kmp_all_nth++;
4471  __kmp_nth++;
4472 
4473  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4474  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4475  if (__kmp_adjust_gtid_mode) {
4476  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4477  if (TCR_4(__kmp_gtid_mode) != 2) {
4478  TCW_4(__kmp_gtid_mode, 2);
4479  }
4480  } else {
4481  if (TCR_4(__kmp_gtid_mode) != 1) {
4482  TCW_4(__kmp_gtid_mode, 1);
4483  }
4484  }
4485  }
4486 
4487 #ifdef KMP_ADJUST_BLOCKTIME
4488  /* Adjust blocktime back to zero if necessary */
4489  /* Middle initialization might not have occurred yet */
4490  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4491  if (__kmp_nth > __kmp_avail_proc) {
4492  __kmp_zero_bt = TRUE;
4493  }
4494  }
4495 #endif /* KMP_ADJUST_BLOCKTIME */
4496 
4497  /* actually fork it and create the new worker thread */
4498  KF_TRACE(
4499  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4500  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4501  KF_TRACE(10,
4502  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4503 
4504  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4505  new_gtid));
4506  KMP_MB();
4507  return new_thr;
4508 }
4509 
4510 /* Reinitialize team for reuse.
4511  The hot team code calls this case at every fork barrier, so EPCC barrier
4512  test are extremely sensitive to changes in it, esp. writes to the team
4513  struct, which cause a cache invalidation in all threads.
4514  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4515 static void __kmp_reinitialize_team(kmp_team_t *team,
4516  kmp_internal_control_t *new_icvs,
4517  ident_t *loc) {
4518  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4519  team->t.t_threads[0], team));
4520  KMP_DEBUG_ASSERT(team && new_icvs);
4521  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4522  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4523 
4524  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4525  // Copy ICVs to the master thread's implicit taskdata
4526  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4527  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4528 
4529  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4530  team->t.t_threads[0], team));
4531 }
4532 
4533 /* Initialize the team data structure.
4534  This assumes the t_threads and t_max_nproc are already set.
4535  Also, we don't touch the arguments */
4536 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4537  kmp_internal_control_t *new_icvs,
4538  ident_t *loc) {
4539  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4540 
4541  /* verify */
4542  KMP_DEBUG_ASSERT(team);
4543  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4544  KMP_DEBUG_ASSERT(team->t.t_threads);
4545  KMP_MB();
4546 
4547  team->t.t_master_tid = 0; /* not needed */
4548  /* team->t.t_master_bar; not needed */
4549  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4550  team->t.t_nproc = new_nproc;
4551 
4552  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4553  team->t.t_next_pool = NULL;
4554  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4555  * up hot team */
4556 
4557  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4558  team->t.t_invoke = NULL; /* not needed */
4559 
4560  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4561  team->t.t_sched.sched = new_icvs->sched.sched;
4562 
4563 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4564  team->t.t_fp_control_saved = FALSE; /* not needed */
4565  team->t.t_x87_fpu_control_word = 0; /* not needed */
4566  team->t.t_mxcsr = 0; /* not needed */
4567 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4568 
4569  team->t.t_construct = 0;
4570 
4571  team->t.t_ordered.dt.t_value = 0;
4572  team->t.t_master_active = FALSE;
4573 
4574 #ifdef KMP_DEBUG
4575  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4576 #endif
4577 #if KMP_OS_WINDOWS
4578  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4579 #endif
4580 
4581  team->t.t_control_stack_top = NULL;
4582 
4583  __kmp_reinitialize_team(team, new_icvs, loc);
4584 
4585  KMP_MB();
4586  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4587 }
4588 
4589 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4590 /* Sets full mask for thread and returns old mask, no changes to structures. */
4591 static void
4592 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4593  if (KMP_AFFINITY_CAPABLE()) {
4594  int status;
4595  if (old_mask != NULL) {
4596  status = __kmp_get_system_affinity(old_mask, TRUE);
4597  int error = errno;
4598  if (status != 0) {
4599  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4600  __kmp_msg_null);
4601  }
4602  }
4603  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4604  }
4605 }
4606 #endif
4607 
4608 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4609 
4610 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4611 // It calculats the worker + master thread's partition based upon the parent
4612 // thread's partition, and binds each worker to a thread in their partition.
4613 // The master thread's partition should already include its current binding.
4614 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4615  // Copy the master thread's place partion to the team struct
4616  kmp_info_t *master_th = team->t.t_threads[0];
4617  KMP_DEBUG_ASSERT(master_th != NULL);
4618  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4619  int first_place = master_th->th.th_first_place;
4620  int last_place = master_th->th.th_last_place;
4621  int masters_place = master_th->th.th_current_place;
4622  team->t.t_first_place = first_place;
4623  team->t.t_last_place = last_place;
4624 
4625  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4626  "bound to place %d partition = [%d,%d]\n",
4627  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4628  team->t.t_id, masters_place, first_place, last_place));
4629 
4630  switch (proc_bind) {
4631 
4632  case proc_bind_default:
4633  // serial teams might have the proc_bind policy set to proc_bind_default. It
4634  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4635  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4636  break;
4637 
4638  case proc_bind_master: {
4639  int f;
4640  int n_th = team->t.t_nproc;
4641  for (f = 1; f < n_th; f++) {
4642  kmp_info_t *th = team->t.t_threads[f];
4643  KMP_DEBUG_ASSERT(th != NULL);
4644  th->th.th_first_place = first_place;
4645  th->th.th_last_place = last_place;
4646  th->th.th_new_place = masters_place;
4647 #if OMP_50_ENABLED
4648  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4649  team->t.t_display_affinity != 1) {
4650  team->t.t_display_affinity = 1;
4651  }
4652 #endif
4653 
4654  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4655  "partition = [%d,%d]\n",
4656  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4657  f, masters_place, first_place, last_place));
4658  }
4659  } break;
4660 
4661  case proc_bind_close: {
4662  int f;
4663  int n_th = team->t.t_nproc;
4664  int n_places;
4665  if (first_place <= last_place) {
4666  n_places = last_place - first_place + 1;
4667  } else {
4668  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4669  }
4670  if (n_th <= n_places) {
4671  int place = masters_place;
4672  for (f = 1; f < n_th; f++) {
4673  kmp_info_t *th = team->t.t_threads[f];
4674  KMP_DEBUG_ASSERT(th != NULL);
4675 
4676  if (place == last_place) {
4677  place = first_place;
4678  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4679  place = 0;
4680  } else {
4681  place++;
4682  }
4683  th->th.th_first_place = first_place;
4684  th->th.th_last_place = last_place;
4685  th->th.th_new_place = place;
4686 #if OMP_50_ENABLED
4687  if (__kmp_display_affinity && place != th->th.th_current_place &&
4688  team->t.t_display_affinity != 1) {
4689  team->t.t_display_affinity = 1;
4690  }
4691 #endif
4692 
4693  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4694  "partition = [%d,%d]\n",
4695  __kmp_gtid_from_thread(team->t.t_threads[f]),
4696  team->t.t_id, f, place, first_place, last_place));
4697  }
4698  } else {
4699  int S, rem, gap, s_count;
4700  S = n_th / n_places;
4701  s_count = 0;
4702  rem = n_th - (S * n_places);
4703  gap = rem > 0 ? n_places / rem : n_places;
4704  int place = masters_place;
4705  int gap_ct = gap;
4706  for (f = 0; f < n_th; f++) {
4707  kmp_info_t *th = team->t.t_threads[f];
4708  KMP_DEBUG_ASSERT(th != NULL);
4709 
4710  th->th.th_first_place = first_place;
4711  th->th.th_last_place = last_place;
4712  th->th.th_new_place = place;
4713 #if OMP_50_ENABLED
4714  if (__kmp_display_affinity && place != th->th.th_current_place &&
4715  team->t.t_display_affinity != 1) {
4716  team->t.t_display_affinity = 1;
4717  }
4718 #endif
4719  s_count++;
4720 
4721  if ((s_count == S) && rem && (gap_ct == gap)) {
4722  // do nothing, add an extra thread to place on next iteration
4723  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4724  // we added an extra thread to this place; move to next place
4725  if (place == last_place) {
4726  place = first_place;
4727  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4728  place = 0;
4729  } else {
4730  place++;
4731  }
4732  s_count = 0;
4733  gap_ct = 1;
4734  rem--;
4735  } else if (s_count == S) { // place full; don't add extra
4736  if (place == last_place) {
4737  place = first_place;
4738  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4739  place = 0;
4740  } else {
4741  place++;
4742  }
4743  gap_ct++;
4744  s_count = 0;
4745  }
4746 
4747  KA_TRACE(100,
4748  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4749  "partition = [%d,%d]\n",
4750  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4751  th->th.th_new_place, first_place, last_place));
4752  }
4753  KMP_DEBUG_ASSERT(place == masters_place);
4754  }
4755  } break;
4756 
4757  case proc_bind_spread: {
4758  int f;
4759  int n_th = team->t.t_nproc;
4760  int n_places;
4761  int thidx;
4762  if (first_place <= last_place) {
4763  n_places = last_place - first_place + 1;
4764  } else {
4765  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4766  }
4767  if (n_th <= n_places) {
4768  int place = -1;
4769 
4770  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4771  int S = n_places / n_th;
4772  int s_count, rem, gap, gap_ct;
4773 
4774  place = masters_place;
4775  rem = n_places - n_th * S;
4776  gap = rem ? n_th / rem : 1;
4777  gap_ct = gap;
4778  thidx = n_th;
4779  if (update_master_only == 1)
4780  thidx = 1;
4781  for (f = 0; f < thidx; f++) {
4782  kmp_info_t *th = team->t.t_threads[f];
4783  KMP_DEBUG_ASSERT(th != NULL);
4784 
4785  th->th.th_first_place = place;
4786  th->th.th_new_place = place;
4787 #if OMP_50_ENABLED
4788  if (__kmp_display_affinity && place != th->th.th_current_place &&
4789  team->t.t_display_affinity != 1) {
4790  team->t.t_display_affinity = 1;
4791  }
4792 #endif
4793  s_count = 1;
4794  while (s_count < S) {
4795  if (place == last_place) {
4796  place = first_place;
4797  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4798  place = 0;
4799  } else {
4800  place++;
4801  }
4802  s_count++;
4803  }
4804  if (rem && (gap_ct == gap)) {
4805  if (place == last_place) {
4806  place = first_place;
4807  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4808  place = 0;
4809  } else {
4810  place++;
4811  }
4812  rem--;
4813  gap_ct = 0;
4814  }
4815  th->th.th_last_place = place;
4816  gap_ct++;
4817 
4818  if (place == last_place) {
4819  place = first_place;
4820  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4821  place = 0;
4822  } else {
4823  place++;
4824  }
4825 
4826  KA_TRACE(100,
4827  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4828  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4829  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4830  f, th->th.th_new_place, th->th.th_first_place,
4831  th->th.th_last_place, __kmp_affinity_num_masks));
4832  }
4833  } else {
4834  /* Having uniform space of available computation places I can create
4835  T partitions of round(P/T) size and put threads into the first
4836  place of each partition. */
4837  double current = static_cast<double>(masters_place);
4838  double spacing =
4839  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4840  int first, last;
4841  kmp_info_t *th;
4842 
4843  thidx = n_th + 1;
4844  if (update_master_only == 1)
4845  thidx = 1;
4846  for (f = 0; f < thidx; f++) {
4847  first = static_cast<int>(current);
4848  last = static_cast<int>(current + spacing) - 1;
4849  KMP_DEBUG_ASSERT(last >= first);
4850  if (first >= n_places) {
4851  if (masters_place) {
4852  first -= n_places;
4853  last -= n_places;
4854  if (first == (masters_place + 1)) {
4855  KMP_DEBUG_ASSERT(f == n_th);
4856  first--;
4857  }
4858  if (last == masters_place) {
4859  KMP_DEBUG_ASSERT(f == (n_th - 1));
4860  last--;
4861  }
4862  } else {
4863  KMP_DEBUG_ASSERT(f == n_th);
4864  first = 0;
4865  last = 0;
4866  }
4867  }
4868  if (last >= n_places) {
4869  last = (n_places - 1);
4870  }
4871  place = first;
4872  current += spacing;
4873  if (f < n_th) {
4874  KMP_DEBUG_ASSERT(0 <= first);
4875  KMP_DEBUG_ASSERT(n_places > first);
4876  KMP_DEBUG_ASSERT(0 <= last);
4877  KMP_DEBUG_ASSERT(n_places > last);
4878  KMP_DEBUG_ASSERT(last_place >= first_place);
4879  th = team->t.t_threads[f];
4880  KMP_DEBUG_ASSERT(th);
4881  th->th.th_first_place = first;
4882  th->th.th_new_place = place;
4883  th->th.th_last_place = last;
4884 #if OMP_50_ENABLED
4885  if (__kmp_display_affinity && place != th->th.th_current_place &&
4886  team->t.t_display_affinity != 1) {
4887  team->t.t_display_affinity = 1;
4888  }
4889 #endif
4890  KA_TRACE(100,
4891  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4892  "partition = [%d,%d], spacing = %.4f\n",
4893  __kmp_gtid_from_thread(team->t.t_threads[f]),
4894  team->t.t_id, f, th->th.th_new_place,
4895  th->th.th_first_place, th->th.th_last_place, spacing));
4896  }
4897  }
4898  }
4899  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4900  } else {
4901  int S, rem, gap, s_count;
4902  S = n_th / n_places;
4903  s_count = 0;
4904  rem = n_th - (S * n_places);
4905  gap = rem > 0 ? n_places / rem : n_places;
4906  int place = masters_place;
4907  int gap_ct = gap;
4908  thidx = n_th;
4909  if (update_master_only == 1)
4910  thidx = 1;
4911  for (f = 0; f < thidx; f++) {
4912  kmp_info_t *th = team->t.t_threads[f];
4913  KMP_DEBUG_ASSERT(th != NULL);
4914 
4915  th->th.th_first_place = place;
4916  th->th.th_last_place = place;
4917  th->th.th_new_place = place;
4918 #if OMP_50_ENABLED
4919  if (__kmp_display_affinity && place != th->th.th_current_place &&
4920  team->t.t_display_affinity != 1) {
4921  team->t.t_display_affinity = 1;
4922  }
4923 #endif
4924  s_count++;
4925 
4926  if ((s_count == S) && rem && (gap_ct == gap)) {
4927  // do nothing, add an extra thread to place on next iteration
4928  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4929  // we added an extra thread to this place; move on to next place
4930  if (place == last_place) {
4931  place = first_place;
4932  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4933  place = 0;
4934  } else {
4935  place++;
4936  }
4937  s_count = 0;
4938  gap_ct = 1;
4939  rem--;
4940  } else if (s_count == S) { // place is full; don't add extra thread
4941  if (place == last_place) {
4942  place = first_place;
4943  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4944  place = 0;
4945  } else {
4946  place++;
4947  }
4948  gap_ct++;
4949  s_count = 0;
4950  }
4951 
4952  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4953  "partition = [%d,%d]\n",
4954  __kmp_gtid_from_thread(team->t.t_threads[f]),
4955  team->t.t_id, f, th->th.th_new_place,
4956  th->th.th_first_place, th->th.th_last_place));
4957  }
4958  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4959  }
4960  } break;
4961 
4962  default:
4963  break;
4964  }
4965 
4966  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4967 }
4968 
4969 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4970 
4971 /* allocate a new team data structure to use. take one off of the free pool if
4972  available */
4973 kmp_team_t *
4974 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4975 #if OMPT_SUPPORT
4976  ompt_data_t ompt_parallel_data,
4977 #endif
4978 #if OMP_40_ENABLED
4979  kmp_proc_bind_t new_proc_bind,
4980 #endif
4981  kmp_internal_control_t *new_icvs,
4982  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4983  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4984  int f;
4985  kmp_team_t *team;
4986  int use_hot_team = !root->r.r_active;
4987  int level = 0;
4988 
4989  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4990  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4991  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4992  KMP_MB();
4993 
4994 #if KMP_NESTED_HOT_TEAMS
4995  kmp_hot_team_ptr_t *hot_teams;
4996  if (master) {
4997  team = master->th.th_team;
4998  level = team->t.t_active_level;
4999  if (master->th.th_teams_microtask) { // in teams construct?
5000  if (master->th.th_teams_size.nteams > 1 &&
5001  ( // #teams > 1
5002  team->t.t_pkfn ==
5003  (microtask_t)__kmp_teams_master || // inner fork of the teams
5004  master->th.th_teams_level <
5005  team->t.t_level)) { // or nested parallel inside the teams
5006  ++level; // not increment if #teams==1, or for outer fork of the teams;
5007  // increment otherwise
5008  }
5009  }
5010  hot_teams = master->th.th_hot_teams;
5011  if (level < __kmp_hot_teams_max_level && hot_teams &&
5012  hot_teams[level]
5013  .hot_team) { // hot team has already been allocated for given level
5014  use_hot_team = 1;
5015  } else {
5016  use_hot_team = 0;
5017  }
5018  }
5019 #endif
5020  // Optimization to use a "hot" team
5021  if (use_hot_team && new_nproc > 1) {
5022  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5023 #if KMP_NESTED_HOT_TEAMS
5024  team = hot_teams[level].hot_team;
5025 #else
5026  team = root->r.r_hot_team;
5027 #endif
5028 #if KMP_DEBUG
5029  if (__kmp_tasking_mode != tskm_immediate_exec) {
5030  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5031  "task_team[1] = %p before reinit\n",
5032  team->t.t_task_team[0], team->t.t_task_team[1]));
5033  }
5034 #endif
5035 
5036  // Has the number of threads changed?
5037  /* Let's assume the most common case is that the number of threads is
5038  unchanged, and put that case first. */
5039  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5040  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5041  // This case can mean that omp_set_num_threads() was called and the hot
5042  // team size was already reduced, so we check the special flag
5043  if (team->t.t_size_changed == -1) {
5044  team->t.t_size_changed = 1;
5045  } else {
5046  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5047  }
5048 
5049  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5050  kmp_r_sched_t new_sched = new_icvs->sched;
5051  // set master's schedule as new run-time schedule
5052  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5053 
5054  __kmp_reinitialize_team(team, new_icvs,
5055  root->r.r_uber_thread->th.th_ident);
5056 
5057  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5058  team->t.t_threads[0], team));
5059  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5060 
5061 #if OMP_40_ENABLED
5062 #if KMP_AFFINITY_SUPPORTED
5063  if ((team->t.t_size_changed == 0) &&
5064  (team->t.t_proc_bind == new_proc_bind)) {
5065  if (new_proc_bind == proc_bind_spread) {
5066  __kmp_partition_places(
5067  team, 1); // add flag to update only master for spread
5068  }
5069  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5070  "proc_bind = %d, partition = [%d,%d]\n",
5071  team->t.t_id, new_proc_bind, team->t.t_first_place,
5072  team->t.t_last_place));
5073  } else {
5074  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5075  __kmp_partition_places(team);
5076  }
5077 #else
5078  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5079 #endif /* KMP_AFFINITY_SUPPORTED */
5080 #endif /* OMP_40_ENABLED */
5081  } else if (team->t.t_nproc > new_nproc) {
5082  KA_TRACE(20,
5083  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5084  new_nproc));
5085 
5086  team->t.t_size_changed = 1;
5087 #if KMP_NESTED_HOT_TEAMS
5088  if (__kmp_hot_teams_mode == 0) {
5089  // AC: saved number of threads should correspond to team's value in this
5090  // mode, can be bigger in mode 1, when hot team has threads in reserve
5091  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5092  hot_teams[level].hot_team_nth = new_nproc;
5093 #endif // KMP_NESTED_HOT_TEAMS
5094  /* release the extra threads we don't need any more */
5095  for (f = new_nproc; f < team->t.t_nproc; f++) {
5096  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5097  if (__kmp_tasking_mode != tskm_immediate_exec) {
5098  // When decreasing team size, threads no longer in the team should
5099  // unref task team.
5100  team->t.t_threads[f]->th.th_task_team = NULL;
5101  }
5102  __kmp_free_thread(team->t.t_threads[f]);
5103  team->t.t_threads[f] = NULL;
5104  }
5105 #if KMP_NESTED_HOT_TEAMS
5106  } // (__kmp_hot_teams_mode == 0)
5107  else {
5108  // When keeping extra threads in team, switch threads to wait on own
5109  // b_go flag
5110  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5111  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5112  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5113  for (int b = 0; b < bs_last_barrier; ++b) {
5114  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5115  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5116  }
5117  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5118  }
5119  }
5120  }
5121 #endif // KMP_NESTED_HOT_TEAMS
5122  team->t.t_nproc = new_nproc;
5123  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5124  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5125  __kmp_reinitialize_team(team, new_icvs,
5126  root->r.r_uber_thread->th.th_ident);
5127 
5128  // Update remaining threads
5129  for (f = 0; f < new_nproc; ++f) {
5130  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5131  }
5132 
5133  // restore the current task state of the master thread: should be the
5134  // implicit task
5135  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5136  team->t.t_threads[0], team));
5137 
5138  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5139 
5140 #ifdef KMP_DEBUG
5141  for (f = 0; f < team->t.t_nproc; f++) {
5142  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5143  team->t.t_threads[f]->th.th_team_nproc ==
5144  team->t.t_nproc);
5145  }
5146 #endif
5147 
5148 #if OMP_40_ENABLED
5149  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5150 #if KMP_AFFINITY_SUPPORTED
5151  __kmp_partition_places(team);
5152 #endif
5153 #endif
5154  } else { // team->t.t_nproc < new_nproc
5155 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5156  kmp_affin_mask_t *old_mask;
5157  if (KMP_AFFINITY_CAPABLE()) {
5158  KMP_CPU_ALLOC(old_mask);
5159  }
5160 #endif
5161 
5162  KA_TRACE(20,
5163  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5164  new_nproc));
5165 
5166  team->t.t_size_changed = 1;
5167 
5168 #if KMP_NESTED_HOT_TEAMS
5169  int avail_threads = hot_teams[level].hot_team_nth;
5170  if (new_nproc < avail_threads)
5171  avail_threads = new_nproc;
5172  kmp_info_t **other_threads = team->t.t_threads;
5173  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5174  // Adjust barrier data of reserved threads (if any) of the team
5175  // Other data will be set in __kmp_initialize_info() below.
5176  int b;
5177  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5178  for (b = 0; b < bs_last_barrier; ++b) {
5179  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5180  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5181 #if USE_DEBUGGER
5182  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5183 #endif
5184  }
5185  }
5186  if (hot_teams[level].hot_team_nth >= new_nproc) {
5187  // we have all needed threads in reserve, no need to allocate any
5188  // this only possible in mode 1, cannot have reserved threads in mode 0
5189  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5190  team->t.t_nproc = new_nproc; // just get reserved threads involved
5191  } else {
5192  // we may have some threads in reserve, but not enough
5193  team->t.t_nproc =
5194  hot_teams[level]
5195  .hot_team_nth; // get reserved threads involved if any
5196  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5197 #endif // KMP_NESTED_HOT_TEAMS
5198  if (team->t.t_max_nproc < new_nproc) {
5199  /* reallocate larger arrays */
5200  __kmp_reallocate_team_arrays(team, new_nproc);
5201  __kmp_reinitialize_team(team, new_icvs, NULL);
5202  }
5203 
5204 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5205  /* Temporarily set full mask for master thread before creation of
5206  workers. The reason is that workers inherit the affinity from master,
5207  so if a lot of workers are created on the single core quickly, they
5208  don't get a chance to set their own affinity for a long time. */
5209  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5210 #endif
5211 
5212  /* allocate new threads for the hot team */
5213  for (f = team->t.t_nproc; f < new_nproc; f++) {
5214  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5215  KMP_DEBUG_ASSERT(new_worker);
5216  team->t.t_threads[f] = new_worker;
5217 
5218  KA_TRACE(20,
5219  ("__kmp_allocate_team: team %d init T#%d arrived: "
5220  "join=%llu, plain=%llu\n",
5221  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5222  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5223  team->t.t_bar[bs_plain_barrier].b_arrived));
5224 
5225  { // Initialize barrier data for new threads.
5226  int b;
5227  kmp_balign_t *balign = new_worker->th.th_bar;
5228  for (b = 0; b < bs_last_barrier; ++b) {
5229  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5230  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5231  KMP_BARRIER_PARENT_FLAG);
5232 #if USE_DEBUGGER
5233  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5234 #endif
5235  }
5236  }
5237  }
5238 
5239 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5240  if (KMP_AFFINITY_CAPABLE()) {
5241  /* Restore initial master thread's affinity mask */
5242  __kmp_set_system_affinity(old_mask, TRUE);
5243  KMP_CPU_FREE(old_mask);
5244  }
5245 #endif
5246 #if KMP_NESTED_HOT_TEAMS
5247  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5248 #endif // KMP_NESTED_HOT_TEAMS
5249  /* make sure everyone is syncronized */
5250  int old_nproc = team->t.t_nproc; // save old value and use to update only
5251  // new threads below
5252  __kmp_initialize_team(team, new_nproc, new_icvs,
5253  root->r.r_uber_thread->th.th_ident);
5254 
5255  /* reinitialize the threads */
5256  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5257  for (f = 0; f < team->t.t_nproc; ++f)
5258  __kmp_initialize_info(team->t.t_threads[f], team, f,
5259  __kmp_gtid_from_tid(f, team));
5260 
5261  if (level) { // set th_task_state for new threads in nested hot team
5262  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5263  // only need to set the th_task_state for the new threads. th_task_state
5264  // for master thread will not be accurate until after this in
5265  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5266  // correct value.
5267  for (f = old_nproc; f < team->t.t_nproc; ++f)
5268  team->t.t_threads[f]->th.th_task_state =
5269  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5270  } else { // set th_task_state for new threads in non-nested hot team
5271  int old_state =
5272  team->t.t_threads[0]->th.th_task_state; // copy master's state
5273  for (f = old_nproc; f < team->t.t_nproc; ++f)
5274  team->t.t_threads[f]->th.th_task_state = old_state;
5275  }
5276 
5277 #ifdef KMP_DEBUG
5278  for (f = 0; f < team->t.t_nproc; ++f) {
5279  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5280  team->t.t_threads[f]->th.th_team_nproc ==
5281  team->t.t_nproc);
5282  }
5283 #endif
5284 
5285 #if OMP_40_ENABLED
5286  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5287 #if KMP_AFFINITY_SUPPORTED
5288  __kmp_partition_places(team);
5289 #endif
5290 #endif
5291  } // Check changes in number of threads
5292 
5293 #if OMP_40_ENABLED
5294  kmp_info_t *master = team->t.t_threads[0];
5295  if (master->th.th_teams_microtask) {
5296  for (f = 1; f < new_nproc; ++f) {
5297  // propagate teams construct specific info to workers
5298  kmp_info_t *thr = team->t.t_threads[f];
5299  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5300  thr->th.th_teams_level = master->th.th_teams_level;
5301  thr->th.th_teams_size = master->th.th_teams_size;
5302  }
5303  }
5304 #endif /* OMP_40_ENABLED */
5305 #if KMP_NESTED_HOT_TEAMS
5306  if (level) {
5307  // Sync barrier state for nested hot teams, not needed for outermost hot
5308  // team.
5309  for (f = 1; f < new_nproc; ++f) {
5310  kmp_info_t *thr = team->t.t_threads[f];
5311  int b;
5312  kmp_balign_t *balign = thr->th.th_bar;
5313  for (b = 0; b < bs_last_barrier; ++b) {
5314  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5315  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5316 #if USE_DEBUGGER
5317  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5318 #endif
5319  }
5320  }
5321  }
5322 #endif // KMP_NESTED_HOT_TEAMS
5323 
5324  /* reallocate space for arguments if necessary */
5325  __kmp_alloc_argv_entries(argc, team, TRUE);
5326  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5327  // The hot team re-uses the previous task team,
5328  // if untouched during the previous release->gather phase.
5329 
5330  KF_TRACE(10, (" hot_team = %p\n", team));
5331 
5332 #if KMP_DEBUG
5333  if (__kmp_tasking_mode != tskm_immediate_exec) {
5334  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5335  "task_team[1] = %p after reinit\n",
5336  team->t.t_task_team[0], team->t.t_task_team[1]));
5337  }
5338 #endif
5339 
5340 #if OMPT_SUPPORT
5341  __ompt_team_assign_id(team, ompt_parallel_data);
5342 #endif
5343 
5344  KMP_MB();
5345 
5346  return team;
5347  }
5348 
5349  /* next, let's try to take one from the team pool */
5350  KMP_MB();
5351  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5352  /* TODO: consider resizing undersized teams instead of reaping them, now
5353  that we have a resizing mechanism */
5354  if (team->t.t_max_nproc >= max_nproc) {
5355  /* take this team from the team pool */
5356  __kmp_team_pool = team->t.t_next_pool;
5357 
5358  /* setup the team for fresh use */
5359  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5360 
5361  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5362  "task_team[1] %p to NULL\n",
5363  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5364  team->t.t_task_team[0] = NULL;
5365  team->t.t_task_team[1] = NULL;
5366 
5367  /* reallocate space for arguments if necessary */
5368  __kmp_alloc_argv_entries(argc, team, TRUE);
5369  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5370 
5371  KA_TRACE(
5372  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5373  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5374  { // Initialize barrier data.
5375  int b;
5376  for (b = 0; b < bs_last_barrier; ++b) {
5377  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5378 #if USE_DEBUGGER
5379  team->t.t_bar[b].b_master_arrived = 0;
5380  team->t.t_bar[b].b_team_arrived = 0;
5381 #endif
5382  }
5383  }
5384 
5385 #if OMP_40_ENABLED
5386  team->t.t_proc_bind = new_proc_bind;
5387 #endif
5388 
5389  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5390  team->t.t_id));
5391 
5392 #if OMPT_SUPPORT
5393  __ompt_team_assign_id(team, ompt_parallel_data);
5394 #endif
5395 
5396  KMP_MB();
5397 
5398  return team;
5399  }
5400 
5401  /* reap team if it is too small, then loop back and check the next one */
5402  // not sure if this is wise, but, will be redone during the hot-teams
5403  // rewrite.
5404  /* TODO: Use technique to find the right size hot-team, don't reap them */
5405  team = __kmp_reap_team(team);
5406  __kmp_team_pool = team;
5407  }
5408 
5409  /* nothing available in the pool, no matter, make a new team! */
5410  KMP_MB();
5411  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5412 
5413  /* and set it up */
5414  team->t.t_max_nproc = max_nproc;
5415  /* NOTE well, for some reason allocating one big buffer and dividing it up
5416  seems to really hurt performance a lot on the P4, so, let's not use this */
5417  __kmp_allocate_team_arrays(team, max_nproc);
5418 
5419  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5420  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5421 
5422  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5423  "%p to NULL\n",
5424  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5425  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5426  // memory, no need to duplicate
5427  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5428  // memory, no need to duplicate
5429 
5430  if (__kmp_storage_map) {
5431  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5432  }
5433 
5434  /* allocate space for arguments */
5435  __kmp_alloc_argv_entries(argc, team, FALSE);
5436  team->t.t_argc = argc;
5437 
5438  KA_TRACE(20,
5439  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5440  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5441  { // Initialize barrier data.
5442  int b;
5443  for (b = 0; b < bs_last_barrier; ++b) {
5444  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5445 #if USE_DEBUGGER
5446  team->t.t_bar[b].b_master_arrived = 0;
5447  team->t.t_bar[b].b_team_arrived = 0;
5448 #endif
5449  }
5450  }
5451 
5452 #if OMP_40_ENABLED
5453  team->t.t_proc_bind = new_proc_bind;
5454 #endif
5455 
5456 #if OMPT_SUPPORT
5457  __ompt_team_assign_id(team, ompt_parallel_data);
5458  team->t.ompt_serialized_team_info = NULL;
5459 #endif
5460 
5461  KMP_MB();
5462 
5463  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5464  team->t.t_id));
5465 
5466  return team;
5467 }
5468 
5469 /* TODO implement hot-teams at all levels */
5470 /* TODO implement lazy thread release on demand (disband request) */
5471 
5472 /* free the team. return it to the team pool. release all the threads
5473  * associated with it */
5474 void __kmp_free_team(kmp_root_t *root,
5475  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5476  int f;
5477  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5478  team->t.t_id));
5479 
5480  /* verify state */
5481  KMP_DEBUG_ASSERT(root);
5482  KMP_DEBUG_ASSERT(team);
5483  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5484  KMP_DEBUG_ASSERT(team->t.t_threads);
5485 
5486  int use_hot_team = team == root->r.r_hot_team;
5487 #if KMP_NESTED_HOT_TEAMS
5488  int level;
5489  kmp_hot_team_ptr_t *hot_teams;
5490  if (master) {
5491  level = team->t.t_active_level - 1;
5492  if (master->th.th_teams_microtask) { // in teams construct?
5493  if (master->th.th_teams_size.nteams > 1) {
5494  ++level; // level was not increased in teams construct for
5495  // team_of_masters
5496  }
5497  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5498  master->th.th_teams_level == team->t.t_level) {
5499  ++level; // level was not increased in teams construct for
5500  // team_of_workers before the parallel
5501  } // team->t.t_level will be increased inside parallel
5502  }
5503  hot_teams = master->th.th_hot_teams;
5504  if (level < __kmp_hot_teams_max_level) {
5505  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5506  use_hot_team = 1;
5507  }
5508  }
5509 #endif // KMP_NESTED_HOT_TEAMS
5510 
5511  /* team is done working */
5512  TCW_SYNC_PTR(team->t.t_pkfn,
5513  NULL); // Important for Debugging Support Library.
5514 #if KMP_OS_WINDOWS
5515  team->t.t_copyin_counter = 0; // init counter for possible reuse
5516 #endif
5517  // Do not reset pointer to parent team to NULL for hot teams.
5518 
5519  /* if we are non-hot team, release our threads */
5520  if (!use_hot_team) {
5521  if (__kmp_tasking_mode != tskm_immediate_exec) {
5522  // Wait for threads to reach reapable state
5523  for (f = 1; f < team->t.t_nproc; ++f) {
5524  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5525  kmp_info_t *th = team->t.t_threads[f];
5526  volatile kmp_uint32 *state = &th->th.th_reap_state;
5527  while (*state != KMP_SAFE_TO_REAP) {
5528 #if KMP_OS_WINDOWS
5529  // On Windows a thread can be killed at any time, check this
5530  DWORD ecode;
5531  if (!__kmp_is_thread_alive(th, &ecode)) {
5532  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5533  break;
5534  }
5535 #endif
5536  // first check if thread is sleeping
5537  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5538  if (fl.is_sleeping())
5539  fl.resume(__kmp_gtid_from_thread(th));
5540  KMP_CPU_PAUSE();
5541  }
5542  }
5543 
5544  // Delete task teams
5545  int tt_idx;
5546  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5547  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5548  if (task_team != NULL) {
5549  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5550  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5551  team->t.t_threads[f]->th.th_task_team = NULL;
5552  }
5553  KA_TRACE(
5554  20,
5555  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5556  __kmp_get_gtid(), task_team, team->t.t_id));
5557 #if KMP_NESTED_HOT_TEAMS
5558  __kmp_free_task_team(master, task_team);
5559 #endif
5560  team->t.t_task_team[tt_idx] = NULL;
5561  }
5562  }
5563  }
5564 
5565  // Reset pointer to parent team only for non-hot teams.
5566  team->t.t_parent = NULL;
5567  team->t.t_level = 0;
5568  team->t.t_active_level = 0;
5569 
5570  /* free the worker threads */
5571  for (f = 1; f < team->t.t_nproc; ++f) {
5572  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5573  __kmp_free_thread(team->t.t_threads[f]);
5574  team->t.t_threads[f] = NULL;
5575  }
5576 
5577  /* put the team back in the team pool */
5578  /* TODO limit size of team pool, call reap_team if pool too large */
5579  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5580  __kmp_team_pool = (volatile kmp_team_t *)team;
5581  } else { // Check if team was created for the masters in a teams construct
5582  // See if first worker is a CG root
5583  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5584  team->t.t_threads[1]->th.th_cg_roots);
5585  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5586  // Clean up the CG root nodes on workers so that this team can be re-used
5587  for (f = 1; f < team->t.t_nproc; ++f) {
5588  kmp_info_t *thr = team->t.t_threads[f];
5589  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5590  thr->th.th_cg_roots->cg_root == thr);
5591  // Pop current CG root off list
5592  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5593  thr->th.th_cg_roots = tmp->up;
5594  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5595  " up to node %p. cg_nthreads was %d\n",
5596  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5597  __kmp_free(tmp);
5598  // Restore current task's thread_limit from CG root
5599  if (thr->th.th_cg_roots)
5600  thr->th.th_current_task->td_icvs.thread_limit =
5601  thr->th.th_cg_roots->cg_thread_limit;
5602  }
5603  }
5604  }
5605 
5606  KMP_MB();
5607 }
5608 
5609 /* reap the team. destroy it, reclaim all its resources and free its memory */
5610 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5611  kmp_team_t *next_pool = team->t.t_next_pool;
5612 
5613  KMP_DEBUG_ASSERT(team);
5614  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5615  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5616  KMP_DEBUG_ASSERT(team->t.t_threads);
5617  KMP_DEBUG_ASSERT(team->t.t_argv);
5618 
5619  /* TODO clean the threads that are a part of this? */
5620 
5621  /* free stuff */
5622  __kmp_free_team_arrays(team);
5623  if (team->t.t_argv != &team->t.t_inline_argv[0])
5624  __kmp_free((void *)team->t.t_argv);
5625  __kmp_free(team);
5626 
5627  KMP_MB();
5628  return next_pool;
5629 }
5630 
5631 // Free the thread. Don't reap it, just place it on the pool of available
5632 // threads.
5633 //
5634 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5635 // binding for the affinity mechanism to be useful.
5636 //
5637 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5638 // However, we want to avoid a potential performance problem by always
5639 // scanning through the list to find the correct point at which to insert
5640 // the thread (potential N**2 behavior). To do this we keep track of the
5641 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5642 // With single-level parallelism, threads will always be added to the tail
5643 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5644 // parallelism, all bets are off and we may need to scan through the entire
5645 // free list.
5646 //
5647 // This change also has a potentially large performance benefit, for some
5648 // applications. Previously, as threads were freed from the hot team, they
5649 // would be placed back on the free list in inverse order. If the hot team
5650 // grew back to it's original size, then the freed thread would be placed
5651 // back on the hot team in reverse order. This could cause bad cache
5652 // locality problems on programs where the size of the hot team regularly
5653 // grew and shrunk.
5654 //
5655 // Now, for single-level parallelism, the OMP tid is alway == gtid.
5656 void __kmp_free_thread(kmp_info_t *this_th) {
5657  int gtid;
5658  kmp_info_t **scan;
5659 
5660  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5661  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5662 
5663  KMP_DEBUG_ASSERT(this_th);
5664 
5665  // When moving thread to pool, switch thread to wait on own b_go flag, and
5666  // uninitialized (NULL team).
5667  int b;
5668  kmp_balign_t *balign = this_th->th.th_bar;
5669  for (b = 0; b < bs_last_barrier; ++b) {
5670  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5671  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5672  balign[b].bb.team = NULL;
5673  balign[b].bb.leaf_kids = 0;
5674  }
5675  this_th->th.th_task_state = 0;
5676  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5677 
5678  /* put thread back on the free pool */
5679  TCW_PTR(this_th->th.th_team, NULL);
5680  TCW_PTR(this_th->th.th_root, NULL);
5681  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5682 
5683  while (this_th->th.th_cg_roots) {
5684  this_th->th.th_cg_roots->cg_nthreads--;
5685  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5686  " %p of thread %p to %d\n",
5687  this_th, this_th->th.th_cg_roots,
5688  this_th->th.th_cg_roots->cg_root,
5689  this_th->th.th_cg_roots->cg_nthreads));
5690  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5691  if (tmp->cg_root == this_th) { // Thread is a cg_root
5692  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5693  KA_TRACE(
5694  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5695  this_th->th.th_cg_roots = tmp->up;
5696  __kmp_free(tmp);
5697  } else { // Worker thread
5698  this_th->th.th_cg_roots = NULL;
5699  break;
5700  }
5701  }
5702 
5703  /* If the implicit task assigned to this thread can be used by other threads
5704  * -> multiple threads can share the data and try to free the task at
5705  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5706  * with higher probability when hot team is disabled but can occurs even when
5707  * the hot team is enabled */
5708  __kmp_free_implicit_task(this_th);
5709  this_th->th.th_current_task = NULL;
5710 
5711  // If the __kmp_thread_pool_insert_pt is already past the new insert
5712  // point, then we need to re-scan the entire list.
5713  gtid = this_th->th.th_info.ds.ds_gtid;
5714  if (__kmp_thread_pool_insert_pt != NULL) {
5715  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5716  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5717  __kmp_thread_pool_insert_pt = NULL;
5718  }
5719  }
5720 
5721  // Scan down the list to find the place to insert the thread.
5722  // scan is the address of a link in the list, possibly the address of
5723  // __kmp_thread_pool itself.
5724  //
5725  // In the absence of nested parallism, the for loop will have 0 iterations.
5726  if (__kmp_thread_pool_insert_pt != NULL) {
5727  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5728  } else {
5729  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5730  }
5731  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5732  scan = &((*scan)->th.th_next_pool))
5733  ;
5734 
5735  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5736  // to its address.
5737  TCW_PTR(this_th->th.th_next_pool, *scan);
5738  __kmp_thread_pool_insert_pt = *scan = this_th;
5739  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5740  (this_th->th.th_info.ds.ds_gtid <
5741  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5742  TCW_4(this_th->th.th_in_pool, TRUE);
5743  __kmp_suspend_initialize_thread(this_th);
5744  __kmp_lock_suspend_mx(this_th);
5745  if (this_th->th.th_active == TRUE) {
5746  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5747  this_th->th.th_active_in_pool = TRUE;
5748  }
5749 #if KMP_DEBUG
5750  else {
5751  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5752  }
5753 #endif
5754  __kmp_unlock_suspend_mx(this_th);
5755 
5756  TCW_4(__kmp_nth, __kmp_nth - 1);
5757 
5758 #ifdef KMP_ADJUST_BLOCKTIME
5759  /* Adjust blocktime back to user setting or default if necessary */
5760  /* Middle initialization might never have occurred */
5761  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5762  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5763  if (__kmp_nth <= __kmp_avail_proc) {
5764  __kmp_zero_bt = FALSE;
5765  }
5766  }
5767 #endif /* KMP_ADJUST_BLOCKTIME */
5768 
5769  KMP_MB();
5770 }
5771 
5772 /* ------------------------------------------------------------------------ */
5773 
5774 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5775  int gtid = this_thr->th.th_info.ds.ds_gtid;
5776  /* void *stack_data;*/
5777  kmp_team_t *(*volatile pteam);
5778 
5779  KMP_MB();
5780  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5781 
5782  if (__kmp_env_consistency_check) {
5783  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5784  }
5785 
5786 #if OMPT_SUPPORT
5787  ompt_data_t *thread_data;
5788  if (ompt_enabled.enabled) {
5789  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5790  *thread_data = ompt_data_none;
5791 
5792  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5793  this_thr->th.ompt_thread_info.wait_id = 0;
5794  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5795  if (ompt_enabled.ompt_callback_thread_begin) {
5796  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5797  ompt_thread_worker, thread_data);
5798  }
5799  }
5800 #endif
5801 
5802 #if OMPT_SUPPORT
5803  if (ompt_enabled.enabled) {
5804  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5805  }
5806 #endif
5807  /* This is the place where threads wait for work */
5808  while (!TCR_4(__kmp_global.g.g_done)) {
5809  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5810  KMP_MB();
5811 
5812  /* wait for work to do */
5813  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5814 
5815  /* No tid yet since not part of a team */
5816  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5817 
5818 #if OMPT_SUPPORT
5819  if (ompt_enabled.enabled) {
5820  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5821  }
5822 #endif
5823 
5824  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5825 
5826  /* have we been allocated? */
5827  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5828  /* we were just woken up, so run our new task */
5829  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5830  int rc;
5831  KA_TRACE(20,
5832  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5833  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5834  (*pteam)->t.t_pkfn));
5835 
5836  updateHWFPControl(*pteam);
5837 
5838 #if OMPT_SUPPORT
5839  if (ompt_enabled.enabled) {
5840  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5841  }
5842 #endif
5843 
5844  rc = (*pteam)->t.t_invoke(gtid);
5845  KMP_ASSERT(rc);
5846 
5847  KMP_MB();
5848  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5849  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5850  (*pteam)->t.t_pkfn));
5851  }
5852 #if OMPT_SUPPORT
5853  if (ompt_enabled.enabled) {
5854  /* no frame set while outside task */
5855  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5856 
5857  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5858  }
5859 #endif
5860  /* join barrier after parallel region */
5861  __kmp_join_barrier(gtid);
5862  }
5863  }
5864  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5865 
5866 #if OMPT_SUPPORT
5867  if (ompt_enabled.ompt_callback_thread_end) {
5868  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5869  }
5870 #endif
5871 
5872  this_thr->th.th_task_team = NULL;
5873  /* run the destructors for the threadprivate data for this thread */
5874  __kmp_common_destroy_gtid(gtid);
5875 
5876  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5877  KMP_MB();
5878  return this_thr;
5879 }
5880 
5881 /* ------------------------------------------------------------------------ */
5882 
5883 void __kmp_internal_end_dest(void *specific_gtid) {
5884 #if KMP_COMPILER_ICC
5885 #pragma warning(push)
5886 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5887 // significant bits
5888 #endif
5889  // Make sure no significant bits are lost
5890  int gtid = (kmp_intptr_t)specific_gtid - 1;
5891 #if KMP_COMPILER_ICC
5892 #pragma warning(pop)
5893 #endif
5894 
5895  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5896  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5897  * this is because 0 is reserved for the nothing-stored case */
5898 
5899  /* josh: One reason for setting the gtid specific data even when it is being
5900  destroyed by pthread is to allow gtid lookup through thread specific data
5901  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5902  that gets executed in the call to __kmp_internal_end_thread, actually
5903  gets the gtid through the thread specific data. Setting it here seems
5904  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5905  to run smoothly.
5906  todo: get rid of this after we remove the dependence on
5907  __kmp_gtid_get_specific */
5908  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5909  __kmp_gtid_set_specific(gtid);
5910 #ifdef KMP_TDATA_GTID
5911  __kmp_gtid = gtid;
5912 #endif
5913  __kmp_internal_end_thread(gtid);
5914 }
5915 
5916 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5917 
5918 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases
5919 // destructors work perfectly, but in real libomp.so I have no evidence it is
5920 // ever called. However, -fini linker option in makefile.mk works fine.
5921 
5922 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5923  __kmp_internal_end_atexit();
5924 }
5925 
5926 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5927 
5928 #endif
5929 
5930 /* [Windows] josh: when the atexit handler is called, there may still be more
5931  than one thread alive */
5932 void __kmp_internal_end_atexit(void) {
5933  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5934  /* [Windows]
5935  josh: ideally, we want to completely shutdown the library in this atexit
5936  handler, but stat code that depends on thread specific data for gtid fails
5937  because that data becomes unavailable at some point during the shutdown, so
5938  we call __kmp_internal_end_thread instead. We should eventually remove the
5939  dependency on __kmp_get_specific_gtid in the stat code and use
5940  __kmp_internal_end_library to cleanly shutdown the library.
5941 
5942  // TODO: Can some of this comment about GVS be removed?
5943  I suspect that the offending stat code is executed when the calling thread
5944  tries to clean up a dead root thread's data structures, resulting in GVS
5945  code trying to close the GVS structures for that thread, but since the stat
5946  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5947  the calling thread is cleaning up itself instead of another thread, it get
5948  confused. This happens because allowing a thread to unregister and cleanup
5949  another thread is a recent modification for addressing an issue.
5950  Based on the current design (20050722), a thread may end up
5951  trying to unregister another thread only if thread death does not trigger
5952  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5953  thread specific data destructor function to detect thread death. For
5954  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5955  is nothing. Thus, the workaround is applicable only for Windows static
5956  stat library. */
5957  __kmp_internal_end_library(-1);
5958 #if KMP_OS_WINDOWS
5959  __kmp_close_console();
5960 #endif
5961 }
5962 
5963 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5964  // It is assumed __kmp_forkjoin_lock is acquired.
5965 
5966  int gtid;
5967 
5968  KMP_DEBUG_ASSERT(thread != NULL);
5969 
5970  gtid = thread->th.th_info.ds.ds_gtid;
5971 
5972  if (!is_root) {
5973  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5974  /* Assume the threads are at the fork barrier here */
5975  KA_TRACE(
5976  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5977  gtid));
5978  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5979  * (GEH) */
5980  ANNOTATE_HAPPENS_BEFORE(thread);
5981  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5982  __kmp_release_64(&flag);
5983  }
5984 
5985  // Terminate OS thread.
5986  __kmp_reap_worker(thread);
5987 
5988  // The thread was killed asynchronously. If it was actively
5989  // spinning in the thread pool, decrement the global count.
5990  //
5991  // There is a small timing hole here - if the worker thread was just waking
5992  // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5993  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5994  // the global counter might not get updated.
5995  //
5996  // Currently, this can only happen as the library is unloaded,
5997  // so there are no harmful side effects.
5998  if (thread->th.th_active_in_pool) {
5999  thread->th.th_active_in_pool = FALSE;
6000  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6001  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6002  }
6003  }
6004 
6005  __kmp_free_implicit_task(thread);
6006 
6007 // Free the fast memory for tasking
6008 #if USE_FAST_MEMORY
6009  __kmp_free_fast_memory(thread);
6010 #endif /* USE_FAST_MEMORY */
6011 
6012  __kmp_suspend_uninitialize_thread(thread);
6013 
6014  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6015  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6016 
6017  --__kmp_all_nth;
6018 // __kmp_nth was decremented when thread is added to the pool.
6019 
6020 #ifdef KMP_ADJUST_BLOCKTIME
6021  /* Adjust blocktime back to user setting or default if necessary */
6022  /* Middle initialization might never have occurred */
6023  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6024  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6025  if (__kmp_nth <= __kmp_avail_proc) {
6026  __kmp_zero_bt = FALSE;
6027  }
6028  }
6029 #endif /* KMP_ADJUST_BLOCKTIME */
6030 
6031  /* free the memory being used */
6032  if (__kmp_env_consistency_check) {
6033  if (thread->th.th_cons) {
6034  __kmp_free_cons_stack(thread->th.th_cons);
6035  thread->th.th_cons = NULL;
6036  }
6037  }
6038 
6039  if (thread->th.th_pri_common != NULL) {
6040  __kmp_free(thread->th.th_pri_common);
6041  thread->th.th_pri_common = NULL;
6042  }
6043 
6044  if (thread->th.th_task_state_memo_stack != NULL) {
6045  __kmp_free(thread->th.th_task_state_memo_stack);
6046  thread->th.th_task_state_memo_stack = NULL;
6047  }
6048 
6049 #if KMP_USE_BGET
6050  if (thread->th.th_local.bget_data != NULL) {
6051  __kmp_finalize_bget(thread);
6052  }
6053 #endif
6054 
6055 #if KMP_AFFINITY_SUPPORTED
6056  if (thread->th.th_affin_mask != NULL) {
6057  KMP_CPU_FREE(thread->th.th_affin_mask);
6058  thread->th.th_affin_mask = NULL;
6059  }
6060 #endif /* KMP_AFFINITY_SUPPORTED */
6061 
6062 #if KMP_USE_HIER_SCHED
6063  if (thread->th.th_hier_bar_data != NULL) {
6064  __kmp_free(thread->th.th_hier_bar_data);
6065  thread->th.th_hier_bar_data = NULL;
6066  }
6067 #endif
6068 
6069  __kmp_reap_team(thread->th.th_serial_team);
6070  thread->th.th_serial_team = NULL;
6071  __kmp_free(thread);
6072 
6073  KMP_MB();
6074 
6075 } // __kmp_reap_thread
6076 
6077 static void __kmp_internal_end(void) {
6078  int i;
6079 
6080  /* First, unregister the library */
6081  __kmp_unregister_library();
6082 
6083 #if KMP_OS_WINDOWS
6084  /* In Win static library, we can't tell when a root actually dies, so we
6085  reclaim the data structures for any root threads that have died but not
6086  unregistered themselves, in order to shut down cleanly.
6087  In Win dynamic library we also can't tell when a thread dies. */
6088  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6089 // dead roots
6090 #endif
6091 
6092  for (i = 0; i < __kmp_threads_capacity; i++)
6093  if (__kmp_root[i])
6094  if (__kmp_root[i]->r.r_active)
6095  break;
6096  KMP_MB(); /* Flush all pending memory write invalidates. */
6097  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6098 
6099  if (i < __kmp_threads_capacity) {
6100 #if KMP_USE_MONITOR
6101  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6102  KMP_MB(); /* Flush all pending memory write invalidates. */
6103 
6104  // Need to check that monitor was initialized before reaping it. If we are
6105  // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6106  // __kmp_monitor will appear to contain valid data, but it is only valid in
6107  // the parent process, not the child.
6108  // New behavior (201008): instead of keying off of the flag
6109  // __kmp_init_parallel, the monitor thread creation is keyed off
6110  // of the new flag __kmp_init_monitor.
6111  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6112  if (TCR_4(__kmp_init_monitor)) {
6113  __kmp_reap_monitor(&__kmp_monitor);
6114  TCW_4(__kmp_init_monitor, 0);
6115  }
6116  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6117  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6118 #endif // KMP_USE_MONITOR
6119  } else {
6120 /* TODO move this to cleanup code */
6121 #ifdef KMP_DEBUG
6122  /* make sure that everything has properly ended */
6123  for (i = 0; i < __kmp_threads_capacity; i++) {
6124  if (__kmp_root[i]) {
6125  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6126  // there can be uber threads alive here
6127  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6128  }
6129  }
6130 #endif
6131 
6132  KMP_MB();
6133 
6134  // Reap the worker threads.
6135  // This is valid for now, but be careful if threads are reaped sooner.
6136  while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6137  // Get the next thread from the pool.
6138  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6139  __kmp_thread_pool = thread->th.th_next_pool;
6140  // Reap it.
6141  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6142  thread->th.th_next_pool = NULL;
6143  thread->th.th_in_pool = FALSE;
6144  __kmp_reap_thread(thread, 0);
6145  }
6146  __kmp_thread_pool_insert_pt = NULL;
6147 
6148  // Reap teams.
6149  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6150  // Get the next team from the pool.
6151  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6152  __kmp_team_pool = team->t.t_next_pool;
6153  // Reap it.
6154  team->t.t_next_pool = NULL;
6155  __kmp_reap_team(team);
6156  }
6157 
6158  __kmp_reap_task_teams();
6159 
6160 #if KMP_OS_UNIX
6161  // Threads that are not reaped should not access any resources since they
6162  // are going to be deallocated soon, so the shutdown sequence should wait
6163  // until all threads either exit the final spin-waiting loop or begin
6164  // sleeping after the given blocktime.
6165  for (i = 0; i < __kmp_threads_capacity; i++) {
6166  kmp_info_t *thr = __kmp_threads[i];
6167  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6168  KMP_CPU_PAUSE();
6169  }
6170 #endif
6171 
6172  for (i = 0; i < __kmp_threads_capacity; ++i) {
6173  // TBD: Add some checking...
6174  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6175  }
6176 
6177  /* Make sure all threadprivate destructors get run by joining with all
6178  worker threads before resetting this flag */
6179  TCW_SYNC_4(__kmp_init_common, FALSE);
6180 
6181  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6182  KMP_MB();
6183 
6184 #if KMP_USE_MONITOR
6185  // See note above: One of the possible fixes for CQ138434 / CQ140126
6186  //
6187  // FIXME: push both code fragments down and CSE them?
6188  // push them into __kmp_cleanup() ?
6189  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6190  if (TCR_4(__kmp_init_monitor)) {
6191  __kmp_reap_monitor(&__kmp_monitor);
6192  TCW_4(__kmp_init_monitor, 0);
6193  }
6194  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6195  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6196 #endif
6197  } /* else !__kmp_global.t_active */
6198  TCW_4(__kmp_init_gtid, FALSE);
6199  KMP_MB(); /* Flush all pending memory write invalidates. */
6200 
6201  __kmp_cleanup();
6202 #if OMPT_SUPPORT
6203  ompt_fini();
6204 #endif
6205 }
6206 
6207 void __kmp_internal_end_library(int gtid_req) {
6208  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6209  /* this shouldn't be a race condition because __kmp_internal_end() is the
6210  only place to clear __kmp_serial_init */
6211  /* we'll check this later too, after we get the lock */
6212  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6213  // redundaant, because the next check will work in any case.
6214  if (__kmp_global.g.g_abort) {
6215  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6216  /* TODO abort? */
6217  return;
6218  }
6219  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6220  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6221  return;
6222  }
6223 
6224  KMP_MB(); /* Flush all pending memory write invalidates. */
6225 
6226  /* find out who we are and what we should do */
6227  {
6228  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6229  KA_TRACE(
6230  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6231  if (gtid == KMP_GTID_SHUTDOWN) {
6232  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6233  "already shutdown\n"));
6234  return;
6235  } else if (gtid == KMP_GTID_MONITOR) {
6236  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6237  "registered, or system shutdown\n"));
6238  return;
6239  } else if (gtid == KMP_GTID_DNE) {
6240  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6241  "shutdown\n"));
6242  /* we don't know who we are, but we may still shutdown the library */
6243  } else if (KMP_UBER_GTID(gtid)) {
6244  /* unregister ourselves as an uber thread. gtid is no longer valid */
6245  if (__kmp_root[gtid]->r.r_active) {
6246  __kmp_global.g.g_abort = -1;
6247  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6248  KA_TRACE(10,
6249  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6250  gtid));
6251  return;
6252  } else {
6253  KA_TRACE(
6254  10,
6255  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6256  __kmp_unregister_root_current_thread(gtid);
6257  }
6258  } else {
6259 /* worker threads may call this function through the atexit handler, if they
6260  * call exit() */
6261 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6262  TODO: do a thorough shutdown instead */
6263 #ifdef DUMP_DEBUG_ON_EXIT
6264  if (__kmp_debug_buf)
6265  __kmp_dump_debug_buffer();
6266 #endif
6267  return;
6268  }
6269  }
6270  /* synchronize the termination process */
6271  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6272 
6273  /* have we already finished */
6274  if (__kmp_global.g.g_abort) {
6275  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6276  /* TODO abort? */
6277  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6278  return;
6279  }
6280  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6281  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6282  return;
6283  }
6284 
6285  /* We need this lock to enforce mutex between this reading of
6286  __kmp_threads_capacity and the writing by __kmp_register_root.
6287  Alternatively, we can use a counter of roots that is atomically updated by
6288  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6289  __kmp_internal_end_*. */
6290  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6291 
6292  /* now we can safely conduct the actual termination */
6293  __kmp_internal_end();
6294 
6295  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6296  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6297 
6298  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6299 
6300 #ifdef DUMP_DEBUG_ON_EXIT
6301  if (__kmp_debug_buf)
6302  __kmp_dump_debug_buffer();
6303 #endif
6304 
6305 #if KMP_OS_WINDOWS
6306  __kmp_close_console();
6307 #endif
6308 
6309  __kmp_fini_allocator();
6310 
6311 } // __kmp_internal_end_library
6312 
6313 void __kmp_internal_end_thread(int gtid_req) {
6314  int i;
6315 
6316  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6317  /* this shouldn't be a race condition because __kmp_internal_end() is the
6318  * only place to clear __kmp_serial_init */
6319  /* we'll check this later too, after we get the lock */
6320  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6321  // redundant, because the next check will work in any case.
6322  if (__kmp_global.g.g_abort) {
6323  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6324  /* TODO abort? */
6325  return;
6326  }
6327  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6328  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6329  return;
6330  }
6331 
6332  KMP_MB(); /* Flush all pending memory write invalidates. */
6333 
6334  /* find out who we are and what we should do */
6335  {
6336  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6337  KA_TRACE(10,
6338  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6339  if (gtid == KMP_GTID_SHUTDOWN) {
6340  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6341  "already shutdown\n"));
6342  return;
6343  } else if (gtid == KMP_GTID_MONITOR) {
6344  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6345  "registered, or system shutdown\n"));
6346  return;
6347  } else if (gtid == KMP_GTID_DNE) {
6348  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6349  "shutdown\n"));
6350  return;
6351  /* we don't know who we are */
6352  } else if (KMP_UBER_GTID(gtid)) {
6353  /* unregister ourselves as an uber thread. gtid is no longer valid */
6354  if (__kmp_root[gtid]->r.r_active) {
6355  __kmp_global.g.g_abort = -1;
6356  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6357  KA_TRACE(10,
6358  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6359  gtid));
6360  return;
6361  } else {
6362  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6363  gtid));
6364  __kmp_unregister_root_current_thread(gtid);
6365  }
6366  } else {
6367  /* just a worker thread, let's leave */
6368  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6369 
6370  if (gtid >= 0) {
6371  __kmp_threads[gtid]->th.th_task_team = NULL;
6372  }
6373 
6374  KA_TRACE(10,
6375  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6376  gtid));
6377  return;
6378  }
6379  }
6380 #if KMP_DYNAMIC_LIB
6381 #if OMP_50_ENABLED
6382  if (__kmp_pause_status != kmp_hard_paused)
6383 #endif
6384  // AC: lets not shutdown the dynamic library at the exit of uber thread,
6385  // because we will better shutdown later in the library destructor.
6386  {
6387  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6388  return;
6389  }
6390 #endif
6391  /* synchronize the termination process */
6392  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6393 
6394  /* have we already finished */
6395  if (__kmp_global.g.g_abort) {
6396  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6397  /* TODO abort? */
6398  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6399  return;
6400  }
6401  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6402  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6403  return;
6404  }
6405 
6406  /* We need this lock to enforce mutex between this reading of
6407  __kmp_threads_capacity and the writing by __kmp_register_root.
6408  Alternatively, we can use a counter of roots that is atomically updated by
6409  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6410  __kmp_internal_end_*. */
6411 
6412  /* should we finish the run-time? are all siblings done? */
6413  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6414 
6415  for (i = 0; i < __kmp_threads_capacity; ++i) {
6416  if (KMP_UBER_GTID(i)) {
6417  KA_TRACE(
6418  10,
6419  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6420  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6421  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6422  return;
6423  }
6424  }
6425 
6426  /* now we can safely conduct the actual termination */
6427 
6428  __kmp_internal_end();
6429 
6430  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6431  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6432 
6433  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6434 
6435 #ifdef DUMP_DEBUG_ON_EXIT
6436  if (__kmp_debug_buf)
6437  __kmp_dump_debug_buffer();
6438 #endif
6439 } // __kmp_internal_end_thread
6440 
6441 // -----------------------------------------------------------------------------
6442 // Library registration stuff.
6443 
6444 static long __kmp_registration_flag = 0;
6445 // Random value used to indicate library initialization.
6446 static char *__kmp_registration_str = NULL;
6447 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6448 
6449 static inline char *__kmp_reg_status_name() {
6450  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6451  each thread. If registration and unregistration go in different threads
6452  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6453  env var can not be found, because the name will contain different pid. */
6454  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6455 } // __kmp_reg_status_get
6456 
6457 void __kmp_register_library_startup(void) {
6458 
6459  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6460  int done = 0;
6461  union {
6462  double dtime;
6463  long ltime;
6464  } time;
6465 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6466  __kmp_initialize_system_tick();
6467 #endif
6468  __kmp_read_system_time(&time.dtime);
6469  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6470  __kmp_registration_str =
6471  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6472  __kmp_registration_flag, KMP_LIBRARY_FILE);
6473 
6474  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6475  __kmp_registration_str));
6476 
6477  while (!done) {
6478 
6479  char *value = NULL; // Actual value of the environment variable.
6480 
6481  // Set environment variable, but do not overwrite if it is exist.
6482  __kmp_env_set(name, __kmp_registration_str, 0);
6483  // Check the variable is written.
6484  value = __kmp_env_get(name);
6485  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6486 
6487  done = 1; // Ok, environment variable set successfully, exit the loop.
6488 
6489  } else {
6490 
6491  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6492  // Check whether it alive or dead.
6493  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6494  char *tail = value;
6495  char *flag_addr_str = NULL;
6496  char *flag_val_str = NULL;
6497  char const *file_name = NULL;
6498  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6499  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6500  file_name = tail;
6501  if (tail != NULL) {
6502  long *flag_addr = 0;
6503  long flag_val = 0;
6504  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6505  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6506  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6507  // First, check whether environment-encoded address is mapped into
6508  // addr space.
6509  // If so, dereference it to see if it still has the right value.
6510  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6511  neighbor = 1;
6512  } else {
6513  // If not, then we know the other copy of the library is no longer
6514  // running.
6515  neighbor = 2;
6516  }
6517  }
6518  }
6519  switch (neighbor) {
6520  case 0: // Cannot parse environment variable -- neighbor status unknown.
6521  // Assume it is the incompatible format of future version of the
6522  // library. Assume the other library is alive.
6523  // WARN( ... ); // TODO: Issue a warning.
6524  file_name = "unknown library";
6525  KMP_FALLTHROUGH();
6526  // Attention! Falling to the next case. That's intentional.
6527  case 1: { // Neighbor is alive.
6528  // Check it is allowed.
6529  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6530  if (!__kmp_str_match_true(duplicate_ok)) {
6531  // That's not allowed. Issue fatal error.
6532  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6533  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6534  }
6535  KMP_INTERNAL_FREE(duplicate_ok);
6536  __kmp_duplicate_library_ok = 1;
6537  done = 1; // Exit the loop.
6538  } break;
6539  case 2: { // Neighbor is dead.
6540  // Clear the variable and try to register library again.
6541  __kmp_env_unset(name);
6542  } break;
6543  default: { KMP_DEBUG_ASSERT(0); } break;
6544  }
6545  }
6546  KMP_INTERNAL_FREE((void *)value);
6547  }
6548  KMP_INTERNAL_FREE((void *)name);
6549 
6550 } // func __kmp_register_library_startup
6551 
6552 void __kmp_unregister_library(void) {
6553 
6554  char *name = __kmp_reg_status_name();
6555  char *value = __kmp_env_get(name);
6556 
6557  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6558  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6559  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6560  // Ok, this is our variable. Delete it.
6561  __kmp_env_unset(name);
6562  }
6563 
6564  KMP_INTERNAL_FREE(__kmp_registration_str);
6565  KMP_INTERNAL_FREE(value);
6566  KMP_INTERNAL_FREE(name);
6567 
6568  __kmp_registration_flag = 0;
6569  __kmp_registration_str = NULL;
6570 
6571 } // __kmp_unregister_library
6572 
6573 // End of Library registration stuff.
6574 // -----------------------------------------------------------------------------
6575 
6576 #if KMP_MIC_SUPPORTED
6577 
6578 static void __kmp_check_mic_type() {
6579  kmp_cpuid_t cpuid_state = {0};
6580  kmp_cpuid_t *cs_p = &cpuid_state;
6581  __kmp_x86_cpuid(1, 0, cs_p);
6582  // We don't support mic1 at the moment
6583  if ((cs_p->eax & 0xff0) == 0xB10) {
6584  __kmp_mic_type = mic2;
6585  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6586  __kmp_mic_type = mic3;
6587  } else {
6588  __kmp_mic_type = non_mic;
6589  }
6590 }
6591 
6592 #endif /* KMP_MIC_SUPPORTED */
6593 
6594 static void __kmp_do_serial_initialize(void) {
6595  int i, gtid;
6596  int size;
6597 
6598  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6599 
6600  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6601  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6602  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6603  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6604  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6605 
6606 #if OMPT_SUPPORT
6607  ompt_pre_init();
6608 #endif
6609 
6610  __kmp_validate_locks();
6611 
6612  /* Initialize internal memory allocator */
6613  __kmp_init_allocator();
6614 
6615  /* Register the library startup via an environment variable and check to see
6616  whether another copy of the library is already registered. */
6617 
6618  __kmp_register_library_startup();
6619 
6620  /* TODO reinitialization of library */
6621  if (TCR_4(__kmp_global.g.g_done)) {
6622  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6623  }
6624 
6625  __kmp_global.g.g_abort = 0;
6626  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6627 
6628 /* initialize the locks */
6629 #if KMP_USE_ADAPTIVE_LOCKS
6630 #if KMP_DEBUG_ADAPTIVE_LOCKS
6631  __kmp_init_speculative_stats();
6632 #endif
6633 #endif
6634 #if KMP_STATS_ENABLED
6635  __kmp_stats_init();
6636 #endif
6637  __kmp_init_lock(&__kmp_global_lock);
6638  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6639  __kmp_init_lock(&__kmp_debug_lock);
6640  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6641  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6642  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6643  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6644  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6645  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6646  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6647  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6648  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6649  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6650  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6651  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6652  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6653  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6654  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6655 #if KMP_USE_MONITOR
6656  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6657 #endif
6658  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6659 
6660  /* conduct initialization and initial setup of configuration */
6661 
6662  __kmp_runtime_initialize();
6663 
6664 #if KMP_MIC_SUPPORTED
6665  __kmp_check_mic_type();
6666 #endif
6667 
6668 // Some global variable initialization moved here from kmp_env_initialize()
6669 #ifdef KMP_DEBUG
6670  kmp_diag = 0;
6671 #endif
6672  __kmp_abort_delay = 0;
6673 
6674  // From __kmp_init_dflt_team_nth()
6675  /* assume the entire machine will be used */
6676  __kmp_dflt_team_nth_ub = __kmp_xproc;
6677  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6678  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6679  }
6680  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6681  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6682  }
6683  __kmp_max_nth = __kmp_sys_max_nth;
6684  __kmp_cg_max_nth = __kmp_sys_max_nth;
6685  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6686  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6687  __kmp_teams_max_nth = __kmp_sys_max_nth;
6688  }
6689 
6690  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6691  // part
6692  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6693 #if KMP_USE_MONITOR
6694  __kmp_monitor_wakeups =
6695  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6696  __kmp_bt_intervals =
6697  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6698 #endif
6699  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6700  __kmp_library = library_throughput;
6701  // From KMP_SCHEDULE initialization
6702  __kmp_static = kmp_sch_static_balanced;
6703 // AC: do not use analytical here, because it is non-monotonous
6704 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6705 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6706 // need to repeat assignment
6707 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6708 // bit control and barrier method control parts
6709 #if KMP_FAST_REDUCTION_BARRIER
6710 #define kmp_reduction_barrier_gather_bb ((int)1)
6711 #define kmp_reduction_barrier_release_bb ((int)1)
6712 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6713 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6714 #endif // KMP_FAST_REDUCTION_BARRIER
6715  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6716  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6717  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6718  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6719  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6720 #if KMP_FAST_REDUCTION_BARRIER
6721  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6722  // lin_64 ): hyper,1
6723  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6724  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6725  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6726  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6727  }
6728 #endif // KMP_FAST_REDUCTION_BARRIER
6729  }
6730 #if KMP_FAST_REDUCTION_BARRIER
6731 #undef kmp_reduction_barrier_release_pat
6732 #undef kmp_reduction_barrier_gather_pat
6733 #undef kmp_reduction_barrier_release_bb
6734 #undef kmp_reduction_barrier_gather_bb
6735 #endif // KMP_FAST_REDUCTION_BARRIER
6736 #if KMP_MIC_SUPPORTED
6737  if (__kmp_mic_type == mic2) { // KNC
6738  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6739  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6740  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6741  1; // forkjoin release
6742  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6743  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6744  }
6745 #if KMP_FAST_REDUCTION_BARRIER
6746  if (__kmp_mic_type == mic2) { // KNC
6747  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6748  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6749  }
6750 #endif // KMP_FAST_REDUCTION_BARRIER
6751 #endif // KMP_MIC_SUPPORTED
6752 
6753 // From KMP_CHECKS initialization
6754 #ifdef KMP_DEBUG
6755  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6756 #else
6757  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6758 #endif
6759 
6760  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6761  __kmp_foreign_tp = TRUE;
6762 
6763  __kmp_global.g.g_dynamic = FALSE;
6764  __kmp_global.g.g_dynamic_mode = dynamic_default;
6765 
6766  __kmp_env_initialize(NULL);
6767 
6768 // Print all messages in message catalog for testing purposes.
6769 #ifdef KMP_DEBUG
6770  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6771  if (__kmp_str_match_true(val)) {
6772  kmp_str_buf_t buffer;
6773  __kmp_str_buf_init(&buffer);
6774  __kmp_i18n_dump_catalog(&buffer);
6775  __kmp_printf("%s", buffer.str);
6776  __kmp_str_buf_free(&buffer);
6777  }
6778  __kmp_env_free(&val);
6779 #endif
6780 
6781  __kmp_threads_capacity =
6782  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6783  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6784  __kmp_tp_capacity = __kmp_default_tp_capacity(
6785  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6786 
6787  // If the library is shut down properly, both pools must be NULL. Just in
6788  // case, set them to NULL -- some memory may leak, but subsequent code will
6789  // work even if pools are not freed.
6790  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6791  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6792  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6793  __kmp_thread_pool = NULL;
6794  __kmp_thread_pool_insert_pt = NULL;
6795  __kmp_team_pool = NULL;
6796 
6797  /* Allocate all of the variable sized records */
6798  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6799  * expandable */
6800  /* Since allocation is cache-aligned, just add extra padding at the end */
6801  size =
6802  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6803  CACHE_LINE;
6804  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6805  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6806  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6807 
6808  /* init thread counts */
6809  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6810  0); // Asserts fail if the library is reinitializing and
6811  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6812  __kmp_all_nth = 0;
6813  __kmp_nth = 0;
6814 
6815  /* setup the uber master thread and hierarchy */
6816  gtid = __kmp_register_root(TRUE);
6817  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6818  KMP_ASSERT(KMP_UBER_GTID(gtid));
6819  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6820 
6821  KMP_MB(); /* Flush all pending memory write invalidates. */
6822 
6823  __kmp_common_initialize();
6824 
6825 #if KMP_OS_UNIX
6826  /* invoke the child fork handler */
6827  __kmp_register_atfork();
6828 #endif
6829 
6830 #if !KMP_DYNAMIC_LIB
6831  {
6832  /* Invoke the exit handler when the program finishes, only for static
6833  library. For dynamic library, we already have _fini and DllMain. */
6834  int rc = atexit(__kmp_internal_end_atexit);
6835  if (rc != 0) {
6836  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6837  __kmp_msg_null);
6838  }
6839  }
6840 #endif
6841 
6842 #if KMP_HANDLE_SIGNALS
6843 #if KMP_OS_UNIX
6844  /* NOTE: make sure that this is called before the user installs their own
6845  signal handlers so that the user handlers are called first. this way they
6846  can return false, not call our handler, avoid terminating the library, and
6847  continue execution where they left off. */
6848  __kmp_install_signals(FALSE);
6849 #endif /* KMP_OS_UNIX */
6850 #if KMP_OS_WINDOWS
6851  __kmp_install_signals(TRUE);
6852 #endif /* KMP_OS_WINDOWS */
6853 #endif
6854 
6855  /* we have finished the serial initialization */
6856  __kmp_init_counter++;
6857 
6858  __kmp_init_serial = TRUE;
6859 
6860  if (__kmp_settings) {
6861  __kmp_env_print();
6862  }
6863 
6864 #if OMP_40_ENABLED
6865  if (__kmp_display_env || __kmp_display_env_verbose) {
6866  __kmp_env_print_2();
6867  }
6868 #endif // OMP_40_ENABLED
6869 
6870 #if OMPT_SUPPORT
6871  ompt_post_init();
6872 #endif
6873 
6874  KMP_MB();
6875 
6876  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6877 }
6878 
6879 void __kmp_serial_initialize(void) {
6880  if (__kmp_init_serial) {
6881  return;
6882  }
6883  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6884  if (__kmp_init_serial) {
6885  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6886  return;
6887  }
6888  __kmp_do_serial_initialize();
6889  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6890 }
6891 
6892 static void __kmp_do_middle_initialize(void) {
6893  int i, j;
6894  int prev_dflt_team_nth;
6895 
6896  if (!__kmp_init_serial) {
6897  __kmp_do_serial_initialize();
6898  }
6899 
6900  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6901 
6902  // Save the previous value for the __kmp_dflt_team_nth so that
6903  // we can avoid some reinitialization if it hasn't changed.
6904  prev_dflt_team_nth = __kmp_dflt_team_nth;
6905 
6906 #if KMP_AFFINITY_SUPPORTED
6907  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6908  // number of cores on the machine.
6909  __kmp_affinity_initialize();
6910 
6911  // Run through the __kmp_threads array and set the affinity mask
6912  // for each root thread that is currently registered with the RTL.
6913  for (i = 0; i < __kmp_threads_capacity; i++) {
6914  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6915  __kmp_affinity_set_init_mask(i, TRUE);
6916  }
6917  }
6918 #endif /* KMP_AFFINITY_SUPPORTED */
6919 
6920  KMP_ASSERT(__kmp_xproc > 0);
6921  if (__kmp_avail_proc == 0) {
6922  __kmp_avail_proc = __kmp_xproc;
6923  }
6924 
6925  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6926  // correct them now
6927  j = 0;
6928  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6929  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6930  __kmp_avail_proc;
6931  j++;
6932  }
6933 
6934  if (__kmp_dflt_team_nth == 0) {
6935 #ifdef KMP_DFLT_NTH_CORES
6936  // Default #threads = #cores
6937  __kmp_dflt_team_nth = __kmp_ncores;
6938  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6939  "__kmp_ncores (%d)\n",
6940  __kmp_dflt_team_nth));
6941 #else
6942  // Default #threads = #available OS procs
6943  __kmp_dflt_team_nth = __kmp_avail_proc;
6944  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6945  "__kmp_avail_proc(%d)\n",
6946  __kmp_dflt_team_nth));
6947 #endif /* KMP_DFLT_NTH_CORES */
6948  }
6949 
6950  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6951  __kmp_dflt_team_nth = KMP_MIN_NTH;
6952  }
6953  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6954  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6955  }
6956 
6957  // There's no harm in continuing if the following check fails,
6958  // but it indicates an error in the previous logic.
6959  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6960 
6961  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6962  // Run through the __kmp_threads array and set the num threads icv for each
6963  // root thread that is currently registered with the RTL (which has not
6964  // already explicitly set its nthreads-var with a call to
6965  // omp_set_num_threads()).
6966  for (i = 0; i < __kmp_threads_capacity; i++) {
6967  kmp_info_t *thread = __kmp_threads[i];
6968  if (thread == NULL)
6969  continue;
6970  if (thread->th.th_current_task->td_icvs.nproc != 0)
6971  continue;
6972 
6973  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6974  }
6975  }
6976  KA_TRACE(
6977  20,
6978  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6979  __kmp_dflt_team_nth));
6980 
6981 #ifdef KMP_ADJUST_BLOCKTIME
6982  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6983  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6984  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6985  if (__kmp_nth > __kmp_avail_proc) {
6986  __kmp_zero_bt = TRUE;
6987  }
6988  }
6989 #endif /* KMP_ADJUST_BLOCKTIME */
6990 
6991  /* we have finished middle initialization */
6992  TCW_SYNC_4(__kmp_init_middle, TRUE);
6993 
6994  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6995 }
6996 
6997 void __kmp_middle_initialize(void) {
6998  if (__kmp_init_middle) {
6999  return;
7000  }
7001  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7002  if (__kmp_init_middle) {
7003  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7004  return;
7005  }
7006  __kmp_do_middle_initialize();
7007  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7008 }
7009 
7010 void __kmp_parallel_initialize(void) {
7011  int gtid = __kmp_entry_gtid(); // this might be a new root
7012 
7013  /* synchronize parallel initialization (for sibling) */
7014  if (TCR_4(__kmp_init_parallel))
7015  return;
7016  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7017  if (TCR_4(__kmp_init_parallel)) {
7018  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7019  return;
7020  }
7021 
7022  /* TODO reinitialization after we have already shut down */
7023  if (TCR_4(__kmp_global.g.g_done)) {
7024  KA_TRACE(
7025  10,
7026  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7027  __kmp_infinite_loop();
7028  }
7029 
7030  /* jc: The lock __kmp_initz_lock is already held, so calling
7031  __kmp_serial_initialize would cause a deadlock. So we call
7032  __kmp_do_serial_initialize directly. */
7033  if (!__kmp_init_middle) {
7034  __kmp_do_middle_initialize();
7035  }
7036 
7037 #if OMP_50_ENABLED
7038  __kmp_resume_if_hard_paused();
7039 #endif
7040 
7041  /* begin initialization */
7042  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7043  KMP_ASSERT(KMP_UBER_GTID(gtid));
7044 
7045 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7046  // Save the FP control regs.
7047  // Worker threads will set theirs to these values at thread startup.
7048  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7049  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7050  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7051 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7052 
7053 #if KMP_OS_UNIX
7054 #if KMP_HANDLE_SIGNALS
7055  /* must be after __kmp_serial_initialize */
7056  __kmp_install_signals(TRUE);
7057 #endif
7058 #endif
7059 
7060  __kmp_suspend_initialize();
7061 
7062 #if defined(USE_LOAD_BALANCE)
7063  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7064  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7065  }
7066 #else
7067  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7068  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7069  }
7070 #endif
7071 
7072  if (__kmp_version) {
7073  __kmp_print_version_2();
7074  }
7075 
7076  /* we have finished parallel initialization */
7077  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7078 
7079  KMP_MB();
7080  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7081 
7082  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7083 }
7084 
7085 /* ------------------------------------------------------------------------ */
7086 
7087 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7088  kmp_team_t *team) {
7089  kmp_disp_t *dispatch;
7090 
7091  KMP_MB();
7092 
7093  /* none of the threads have encountered any constructs, yet. */
7094  this_thr->th.th_local.this_construct = 0;
7095 #if KMP_CACHE_MANAGE
7096  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7097 #endif /* KMP_CACHE_MANAGE */
7098  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7099  KMP_DEBUG_ASSERT(dispatch);
7100  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7101  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7102  // this_thr->th.th_info.ds.ds_tid ] );
7103 
7104  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7105 #if OMP_45_ENABLED
7106  dispatch->th_doacross_buf_idx =
7107  0; /* reset the doacross dispatch buffer counter */
7108 #endif
7109  if (__kmp_env_consistency_check)
7110  __kmp_push_parallel(gtid, team->t.t_ident);
7111 
7112  KMP_MB(); /* Flush all pending memory write invalidates. */
7113 }
7114 
7115 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7116  kmp_team_t *team) {
7117  if (__kmp_env_consistency_check)
7118  __kmp_pop_parallel(gtid, team->t.t_ident);
7119 
7120  __kmp_finish_implicit_task(this_thr);
7121 }
7122 
7123 int __kmp_invoke_task_func(int gtid) {
7124  int rc;
7125  int tid = __kmp_tid_from_gtid(gtid);
7126  kmp_info_t *this_thr = __kmp_threads[gtid];
7127  kmp_team_t *team = this_thr->th.th_team;
7128 
7129  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7130 #if USE_ITT_BUILD
7131  if (__itt_stack_caller_create_ptr) {
7132  __kmp_itt_stack_callee_enter(
7133  (__itt_caller)
7134  team->t.t_stack_id); // inform ittnotify about entering user's code
7135  }
7136 #endif /* USE_ITT_BUILD */
7137 #if INCLUDE_SSC_MARKS
7138  SSC_MARK_INVOKING();
7139 #endif
7140 
7141 #if OMPT_SUPPORT
7142  void *dummy;
7143  void **exit_runtime_p;
7144  ompt_data_t *my_task_data;
7145  ompt_data_t *my_parallel_data;
7146  int ompt_team_size;
7147 
7148  if (ompt_enabled.enabled) {
7149  exit_runtime_p = &(
7150  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7151  } else {
7152  exit_runtime_p = &dummy;
7153  }
7154 
7155  my_task_data =
7156  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7157  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7158  if (ompt_enabled.ompt_callback_implicit_task) {
7159  ompt_team_size = team->t.t_nproc;
7160  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7161  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7162  __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7163  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7164  }
7165 #endif
7166 
7167 #if KMP_STATS_ENABLED
7168  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7169  if (previous_state == stats_state_e::TEAMS_REGION) {
7170  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7171  } else {
7172  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7173  }
7174  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7175 #endif
7176 
7177  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7178  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7179 #if OMPT_SUPPORT
7180  ,
7181  exit_runtime_p
7182 #endif
7183  );
7184 #if OMPT_SUPPORT
7185  *exit_runtime_p = NULL;
7186 #endif
7187 
7188 #if KMP_STATS_ENABLED
7189  if (previous_state == stats_state_e::TEAMS_REGION) {
7190  KMP_SET_THREAD_STATE(previous_state);
7191  }
7192  KMP_POP_PARTITIONED_TIMER();
7193 #endif
7194 
7195 #if USE_ITT_BUILD
7196  if (__itt_stack_caller_create_ptr) {
7197  __kmp_itt_stack_callee_leave(
7198  (__itt_caller)
7199  team->t.t_stack_id); // inform ittnotify about leaving user's code
7200  }
7201 #endif /* USE_ITT_BUILD */
7202  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7203 
7204  return rc;
7205 }
7206 
7207 #if OMP_40_ENABLED
7208 void __kmp_teams_master(int gtid) {
7209  // This routine is called by all master threads in teams construct
7210  kmp_info_t *thr = __kmp_threads[gtid];
7211  kmp_team_t *team = thr->th.th_team;
7212  ident_t *loc = team->t.t_ident;
7213  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7214  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7215  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7216  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7217  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7218 
7219  // This thread is a new CG root. Set up the proper variables.
7220  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7221  tmp->cg_root = thr; // Make thr the CG root
7222  // Init to thread limit that was stored when league masters were forked
7223  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7224  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7225  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7226  " cg_threads to 1\n",
7227  thr, tmp));
7228  tmp->up = thr->th.th_cg_roots;
7229  thr->th.th_cg_roots = tmp;
7230 
7231 // Launch league of teams now, but not let workers execute
7232 // (they hang on fork barrier until next parallel)
7233 #if INCLUDE_SSC_MARKS
7234  SSC_MARK_FORKING();
7235 #endif
7236  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7237  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7238  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7239 #if INCLUDE_SSC_MARKS
7240  SSC_MARK_JOINING();
7241 #endif
7242  // If the team size was reduced from the limit, set it to the new size
7243  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7244  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7245  // AC: last parameter "1" eliminates join barrier which won't work because
7246  // worker threads are in a fork barrier waiting for more parallel regions
7247  __kmp_join_call(loc, gtid
7248 #if OMPT_SUPPORT
7249  ,
7250  fork_context_intel
7251 #endif
7252  ,
7253  1);
7254 }
7255 
7256 int __kmp_invoke_teams_master(int gtid) {
7257  kmp_info_t *this_thr = __kmp_threads[gtid];
7258  kmp_team_t *team = this_thr->th.th_team;
7259 #if KMP_DEBUG
7260  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7261  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7262  (void *)__kmp_teams_master);
7263 #endif
7264  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7265  __kmp_teams_master(gtid);
7266  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7267  return 1;
7268 }
7269 #endif /* OMP_40_ENABLED */
7270 
7271 /* this sets the requested number of threads for the next parallel region
7272  encountered by this team. since this should be enclosed in the forkjoin
7273  critical section it should avoid race conditions with assymmetrical nested
7274  parallelism */
7275 
7276 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7277  kmp_info_t *thr = __kmp_threads[gtid];
7278 
7279  if (num_threads > 0)
7280  thr->th.th_set_nproc = num_threads;
7281 }
7282 
7283 #if OMP_40_ENABLED
7284 
7285 /* this sets the requested number of teams for the teams region and/or
7286  the number of threads for the next parallel region encountered */
7287 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7288  int num_threads) {
7289  kmp_info_t *thr = __kmp_threads[gtid];
7290  KMP_DEBUG_ASSERT(num_teams >= 0);
7291  KMP_DEBUG_ASSERT(num_threads >= 0);
7292 
7293  if (num_teams == 0)
7294  num_teams = 1; // default number of teams is 1.
7295  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7296  if (!__kmp_reserve_warn) {
7297  __kmp_reserve_warn = 1;
7298  __kmp_msg(kmp_ms_warning,
7299  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7300  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7301  }
7302  num_teams = __kmp_teams_max_nth;
7303  }
7304  // Set number of teams (number of threads in the outer "parallel" of the
7305  // teams)
7306  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7307 
7308  // Remember the number of threads for inner parallel regions
7309  if (num_threads == 0) {
7310  if (!TCR_4(__kmp_init_middle))
7311  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7312  num_threads = __kmp_avail_proc / num_teams;
7313  if (num_teams * num_threads > __kmp_teams_max_nth) {
7314  // adjust num_threads w/o warning as it is not user setting
7315  num_threads = __kmp_teams_max_nth / num_teams;
7316  }
7317  } else {
7318  // This thread will be the master of the league masters
7319  // Store new thread limit; old limit is saved in th_cg_roots list
7320  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7321 
7322  if (num_teams * num_threads > __kmp_teams_max_nth) {
7323  int new_threads = __kmp_teams_max_nth / num_teams;
7324  if (!__kmp_reserve_warn) { // user asked for too many threads
7325  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7326  __kmp_msg(kmp_ms_warning,
7327  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7328  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7329  }
7330  num_threads = new_threads;
7331  }
7332  }
7333  thr->th.th_teams_size.nth = num_threads;
7334 }
7335 
7336 // Set the proc_bind var to use in the following parallel region.
7337 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7338  kmp_info_t *thr = __kmp_threads[gtid];
7339  thr->th.th_set_proc_bind = proc_bind;
7340 }
7341 
7342 #endif /* OMP_40_ENABLED */
7343 
7344 /* Launch the worker threads into the microtask. */
7345 
7346 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7347  kmp_info_t *this_thr = __kmp_threads[gtid];
7348 
7349 #ifdef KMP_DEBUG
7350  int f;
7351 #endif /* KMP_DEBUG */
7352 
7353  KMP_DEBUG_ASSERT(team);
7354  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7355  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7356  KMP_MB(); /* Flush all pending memory write invalidates. */
7357 
7358  team->t.t_construct = 0; /* no single directives seen yet */
7359  team->t.t_ordered.dt.t_value =
7360  0; /* thread 0 enters the ordered section first */
7361 
7362  /* Reset the identifiers on the dispatch buffer */
7363  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7364  if (team->t.t_max_nproc > 1) {
7365  int i;
7366  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7367  team->t.t_disp_buffer[i].buffer_index = i;
7368 #if OMP_45_ENABLED
7369  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7370 #endif
7371  }
7372  } else {
7373  team->t.t_disp_buffer[0].buffer_index = 0;
7374 #if OMP_45_ENABLED
7375  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7376 #endif
7377  }
7378 
7379  KMP_MB(); /* Flush all pending memory write invalidates. */
7380  KMP_ASSERT(this_thr->th.th_team == team);
7381 
7382 #ifdef KMP_DEBUG
7383  for (f = 0; f < team->t.t_nproc; f++) {
7384  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7385  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7386  }
7387 #endif /* KMP_DEBUG */
7388 
7389  /* release the worker threads so they may begin working */
7390  __kmp_fork_barrier(gtid, 0);
7391 }
7392 
7393 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7394  kmp_info_t *this_thr = __kmp_threads[gtid];
7395 
7396  KMP_DEBUG_ASSERT(team);
7397  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7398  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7399  KMP_MB(); /* Flush all pending memory write invalidates. */
7400 
7401 /* Join barrier after fork */
7402 
7403 #ifdef KMP_DEBUG
7404  if (__kmp_threads[gtid] &&
7405  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7406  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7407  __kmp_threads[gtid]);
7408  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7409  "team->t.t_nproc=%d\n",
7410  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7411  team->t.t_nproc);
7412  __kmp_print_structure();
7413  }
7414  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7415  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7416 #endif /* KMP_DEBUG */
7417 
7418  __kmp_join_barrier(gtid); /* wait for everyone */
7419 #if OMPT_SUPPORT
7420  if (ompt_enabled.enabled &&
7421  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7422  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7423  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7424  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7425 #if OMPT_OPTIONAL
7426  void *codeptr = NULL;
7427  if (KMP_MASTER_TID(ds_tid) &&
7428  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7429  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7430  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7431 
7432  if (ompt_enabled.ompt_callback_sync_region_wait) {
7433  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7434  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7435  codeptr);
7436  }
7437  if (ompt_enabled.ompt_callback_sync_region) {
7438  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7439  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7440  codeptr);
7441  }
7442 #endif
7443  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7444  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7445  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7446  }
7447  }
7448 #endif
7449 
7450  KMP_MB(); /* Flush all pending memory write invalidates. */
7451  KMP_ASSERT(this_thr->th.th_team == team);
7452 }
7453 
7454 /* ------------------------------------------------------------------------ */
7455 
7456 #ifdef USE_LOAD_BALANCE
7457 
7458 // Return the worker threads actively spinning in the hot team, if we
7459 // are at the outermost level of parallelism. Otherwise, return 0.
7460 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7461  int i;
7462  int retval;
7463  kmp_team_t *hot_team;
7464 
7465  if (root->r.r_active) {
7466  return 0;
7467  }
7468  hot_team = root->r.r_hot_team;
7469  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7470  return hot_team->t.t_nproc - 1; // Don't count master thread
7471  }
7472 
7473  // Skip the master thread - it is accounted for elsewhere.
7474  retval = 0;
7475  for (i = 1; i < hot_team->t.t_nproc; i++) {
7476  if (hot_team->t.t_threads[i]->th.th_active) {
7477  retval++;
7478  }
7479  }
7480  return retval;
7481 }
7482 
7483 // Perform an automatic adjustment to the number of
7484 // threads used by the next parallel region.
7485 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7486  int retval;
7487  int pool_active;
7488  int hot_team_active;
7489  int team_curr_active;
7490  int system_active;
7491 
7492  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7493  set_nproc));
7494  KMP_DEBUG_ASSERT(root);
7495  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7496  ->th.th_current_task->td_icvs.dynamic == TRUE);
7497  KMP_DEBUG_ASSERT(set_nproc > 1);
7498 
7499  if (set_nproc == 1) {
7500  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7501  return 1;
7502  }
7503 
7504  // Threads that are active in the thread pool, active in the hot team for this
7505  // particular root (if we are at the outer par level), and the currently
7506  // executing thread (to become the master) are available to add to the new
7507  // team, but are currently contributing to the system load, and must be
7508  // accounted for.
7509  pool_active = __kmp_thread_pool_active_nth;
7510  hot_team_active = __kmp_active_hot_team_nproc(root);
7511  team_curr_active = pool_active + hot_team_active + 1;
7512 
7513  // Check the system load.
7514  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7515  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7516  "hot team active = %d\n",
7517  system_active, pool_active, hot_team_active));
7518 
7519  if (system_active < 0) {
7520  // There was an error reading the necessary info from /proc, so use the
7521  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7522  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7523  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7524  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7525 
7526  // Make this call behave like the thread limit algorithm.
7527  retval = __kmp_avail_proc - __kmp_nth +
7528  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7529  if (retval > set_nproc) {
7530  retval = set_nproc;
7531  }
7532  if (retval < KMP_MIN_NTH) {
7533  retval = KMP_MIN_NTH;
7534  }
7535 
7536  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7537  retval));
7538  return retval;
7539  }
7540 
7541  // There is a slight delay in the load balance algorithm in detecting new
7542  // running procs. The real system load at this instant should be at least as
7543  // large as the #active omp thread that are available to add to the team.
7544  if (system_active < team_curr_active) {
7545  system_active = team_curr_active;
7546  }
7547  retval = __kmp_avail_proc - system_active + team_curr_active;
7548  if (retval > set_nproc) {
7549  retval = set_nproc;
7550  }
7551  if (retval < KMP_MIN_NTH) {
7552  retval = KMP_MIN_NTH;
7553  }
7554 
7555  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7556  return retval;
7557 } // __kmp_load_balance_nproc()
7558 
7559 #endif /* USE_LOAD_BALANCE */
7560 
7561 /* ------------------------------------------------------------------------ */
7562 
7563 /* NOTE: this is called with the __kmp_init_lock held */
7564 void __kmp_cleanup(void) {
7565  int f;
7566 
7567  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7568 
7569  if (TCR_4(__kmp_init_parallel)) {
7570 #if KMP_HANDLE_SIGNALS
7571  __kmp_remove_signals();
7572 #endif
7573  TCW_4(__kmp_init_parallel, FALSE);
7574  }
7575 
7576  if (TCR_4(__kmp_init_middle)) {
7577 #if KMP_AFFINITY_SUPPORTED
7578  __kmp_affinity_uninitialize();
7579 #endif /* KMP_AFFINITY_SUPPORTED */
7580  __kmp_cleanup_hierarchy();
7581  TCW_4(__kmp_init_middle, FALSE);
7582  }
7583 
7584  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7585 
7586  if (__kmp_init_serial) {
7587  __kmp_runtime_destroy();
7588  __kmp_init_serial = FALSE;
7589  }
7590 
7591  __kmp_cleanup_threadprivate_caches();
7592 
7593  for (f = 0; f < __kmp_threads_capacity; f++) {
7594  if (__kmp_root[f] != NULL) {
7595  __kmp_free(__kmp_root[f]);
7596  __kmp_root[f] = NULL;
7597  }
7598  }
7599  __kmp_free(__kmp_threads);
7600  // __kmp_threads and __kmp_root were allocated at once, as single block, so
7601  // there is no need in freeing __kmp_root.
7602  __kmp_threads = NULL;
7603  __kmp_root = NULL;
7604  __kmp_threads_capacity = 0;
7605 
7606 #if KMP_USE_DYNAMIC_LOCK
7607  __kmp_cleanup_indirect_user_locks();
7608 #else
7609  __kmp_cleanup_user_locks();
7610 #endif
7611 
7612 #if KMP_AFFINITY_SUPPORTED
7613  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7614  __kmp_cpuinfo_file = NULL;
7615 #endif /* KMP_AFFINITY_SUPPORTED */
7616 
7617 #if KMP_USE_ADAPTIVE_LOCKS
7618 #if KMP_DEBUG_ADAPTIVE_LOCKS
7619  __kmp_print_speculative_stats();
7620 #endif
7621 #endif
7622  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7623  __kmp_nested_nth.nth = NULL;
7624  __kmp_nested_nth.size = 0;
7625  __kmp_nested_nth.used = 0;
7626  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7627  __kmp_nested_proc_bind.bind_types = NULL;
7628  __kmp_nested_proc_bind.size = 0;
7629  __kmp_nested_proc_bind.used = 0;
7630 #if OMP_50_ENABLED
7631  if (__kmp_affinity_format) {
7632  KMP_INTERNAL_FREE(__kmp_affinity_format);
7633  __kmp_affinity_format = NULL;
7634  }
7635 #endif
7636 
7637  __kmp_i18n_catclose();
7638 
7639 #if KMP_USE_HIER_SCHED
7640  __kmp_hier_scheds.deallocate();
7641 #endif
7642 
7643 #if KMP_STATS_ENABLED
7644  __kmp_stats_fini();
7645 #endif
7646 
7647  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7648 }
7649 
7650 /* ------------------------------------------------------------------------ */
7651 
7652 int __kmp_ignore_mppbeg(void) {
7653  char *env;
7654 
7655  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7656  if (__kmp_str_match_false(env))
7657  return FALSE;
7658  }
7659  // By default __kmpc_begin() is no-op.
7660  return TRUE;
7661 }
7662 
7663 int __kmp_ignore_mppend(void) {
7664  char *env;
7665 
7666  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7667  if (__kmp_str_match_false(env))
7668  return FALSE;
7669  }
7670  // By default __kmpc_end() is no-op.
7671  return TRUE;
7672 }
7673 
7674 void __kmp_internal_begin(void) {
7675  int gtid;
7676  kmp_root_t *root;
7677 
7678  /* this is a very important step as it will register new sibling threads
7679  and assign these new uber threads a new gtid */
7680  gtid = __kmp_entry_gtid();
7681  root = __kmp_threads[gtid]->th.th_root;
7682  KMP_ASSERT(KMP_UBER_GTID(gtid));
7683 
7684  if (root->r.r_begin)
7685  return;
7686  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7687  if (root->r.r_begin) {
7688  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7689  return;
7690  }
7691 
7692  root->r.r_begin = TRUE;
7693 
7694  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7695 }
7696 
7697 /* ------------------------------------------------------------------------ */
7698 
7699 void __kmp_user_set_library(enum library_type arg) {
7700  int gtid;
7701  kmp_root_t *root;
7702  kmp_info_t *thread;
7703 
7704  /* first, make sure we are initialized so we can get our gtid */
7705 
7706  gtid = __kmp_entry_gtid();
7707  thread = __kmp_threads[gtid];
7708 
7709  root = thread->th.th_root;
7710 
7711  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7712  library_serial));
7713  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7714  thread */
7715  KMP_WARNING(SetLibraryIncorrectCall);
7716  return;
7717  }
7718 
7719  switch (arg) {
7720  case library_serial:
7721  thread->th.th_set_nproc = 0;
7722  set__nproc(thread, 1);
7723  break;
7724  case library_turnaround:
7725  thread->th.th_set_nproc = 0;
7726  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7727  : __kmp_dflt_team_nth_ub);
7728  break;
7729  case library_throughput:
7730  thread->th.th_set_nproc = 0;
7731  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7732  : __kmp_dflt_team_nth_ub);
7733  break;
7734  default:
7735  KMP_FATAL(UnknownLibraryType, arg);
7736  }
7737 
7738  __kmp_aux_set_library(arg);
7739 }
7740 
7741 void __kmp_aux_set_stacksize(size_t arg) {
7742  if (!__kmp_init_serial)
7743  __kmp_serial_initialize();
7744 
7745 #if KMP_OS_DARWIN
7746  if (arg & (0x1000 - 1)) {
7747  arg &= ~(0x1000 - 1);
7748  if (arg + 0x1000) /* check for overflow if we round up */
7749  arg += 0x1000;
7750  }
7751 #endif
7752  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7753 
7754  /* only change the default stacksize before the first parallel region */
7755  if (!TCR_4(__kmp_init_parallel)) {
7756  size_t value = arg; /* argument is in bytes */
7757 
7758  if (value < __kmp_sys_min_stksize)
7759  value = __kmp_sys_min_stksize;
7760  else if (value > KMP_MAX_STKSIZE)
7761  value = KMP_MAX_STKSIZE;
7762 
7763  __kmp_stksize = value;
7764 
7765  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7766  }
7767 
7768  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7769 }
7770 
7771 /* set the behaviour of the runtime library */
7772 /* TODO this can cause some odd behaviour with sibling parallelism... */
7773 void __kmp_aux_set_library(enum library_type arg) {
7774  __kmp_library = arg;
7775 
7776  switch (__kmp_library) {
7777  case library_serial: {
7778  KMP_INFORM(LibraryIsSerial);
7779  } break;
7780  case library_turnaround:
7781  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7782  __kmp_use_yield = 2; // only yield when oversubscribed
7783  break;
7784  case library_throughput:
7785  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7786  __kmp_dflt_blocktime = 200;
7787  break;
7788  default:
7789  KMP_FATAL(UnknownLibraryType, arg);
7790  }
7791 }
7792 
7793 /* Getting team information common for all team API */
7794 // Returns NULL if not in teams construct
7795 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7796  kmp_info_t *thr = __kmp_entry_thread();
7797  teams_serialized = 0;
7798  if (thr->th.th_teams_microtask) {
7799  kmp_team_t *team = thr->th.th_team;
7800  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7801  int ii = team->t.t_level;
7802  teams_serialized = team->t.t_serialized;
7803  int level = tlevel + 1;
7804  KMP_DEBUG_ASSERT(ii >= tlevel);
7805  while (ii > level) {
7806  for (teams_serialized = team->t.t_serialized;
7807  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7808  }
7809  if (team->t.t_serialized && (!teams_serialized)) {
7810  team = team->t.t_parent;
7811  continue;
7812  }
7813  if (ii > level) {
7814  team = team->t.t_parent;
7815  ii--;
7816  }
7817  }
7818  return team;
7819  }
7820  return NULL;
7821 }
7822 
7823 int __kmp_aux_get_team_num() {
7824  int serialized;
7825  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7826  if (team) {
7827  if (serialized > 1) {
7828  return 0; // teams region is serialized ( 1 team of 1 thread ).
7829  } else {
7830  return team->t.t_master_tid;
7831  }
7832  }
7833  return 0;
7834 }
7835 
7836 int __kmp_aux_get_num_teams() {
7837  int serialized;
7838  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7839  if (team) {
7840  if (serialized > 1) {
7841  return 1;
7842  } else {
7843  return team->t.t_parent->t.t_nproc;
7844  }
7845  }
7846  return 1;
7847 }
7848 
7849 /* ------------------------------------------------------------------------ */
7850 
7851 #if OMP_50_ENABLED
7852 /*
7853  * Affinity Format Parser
7854  *
7855  * Field is in form of: %[[[0].]size]type
7856  * % and type are required (%% means print a literal '%')
7857  * type is either single char or long name surrounded by {},
7858  * e.g., N or {num_threads}
7859  * 0 => leading zeros
7860  * . => right justified when size is specified
7861  * by default output is left justified
7862  * size is the *minimum* field length
7863  * All other characters are printed as is
7864  *
7865  * Available field types:
7866  * L {thread_level} - omp_get_level()
7867  * n {thread_num} - omp_get_thread_num()
7868  * h {host} - name of host machine
7869  * P {process_id} - process id (integer)
7870  * T {thread_identifier} - native thread identifier (integer)
7871  * N {num_threads} - omp_get_num_threads()
7872  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
7873  * a {thread_affinity} - comma separated list of integers or integer ranges
7874  * (values of affinity mask)
7875  *
7876  * Implementation-specific field types can be added
7877  * If a type is unknown, print "undefined"
7878 */
7879 
7880 // Structure holding the short name, long name, and corresponding data type
7881 // for snprintf. A table of these will represent the entire valid keyword
7882 // field types.
7883 typedef struct kmp_affinity_format_field_t {
7884  char short_name; // from spec e.g., L -> thread level
7885  const char *long_name; // from spec thread_level -> thread level
7886  char field_format; // data type for snprintf (typically 'd' or 's'
7887  // for integer or string)
7888 } kmp_affinity_format_field_t;
7889 
7890 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7891 #if KMP_AFFINITY_SUPPORTED
7892  {'A', "thread_affinity", 's'},
7893 #endif
7894  {'t', "team_num", 'd'},
7895  {'T', "num_teams", 'd'},
7896  {'L', "nesting_level", 'd'},
7897  {'n', "thread_num", 'd'},
7898  {'N', "num_threads", 'd'},
7899  {'a', "ancestor_tnum", 'd'},
7900  {'H', "host", 's'},
7901  {'P', "process_id", 'd'},
7902  {'i', "native_thread_id", 'd'}};
7903 
7904 // Return the number of characters it takes to hold field
7905 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7906  const char **ptr,
7907  kmp_str_buf_t *field_buffer) {
7908  int rc, format_index, field_value;
7909  const char *width_left, *width_right;
7910  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7911  static const int FORMAT_SIZE = 20;
7912  char format[FORMAT_SIZE] = {0};
7913  char absolute_short_name = 0;
7914 
7915  KMP_DEBUG_ASSERT(gtid >= 0);
7916  KMP_DEBUG_ASSERT(th);
7917  KMP_DEBUG_ASSERT(**ptr == '%');
7918  KMP_DEBUG_ASSERT(field_buffer);
7919 
7920  __kmp_str_buf_clear(field_buffer);
7921 
7922  // Skip the initial %
7923  (*ptr)++;
7924 
7925  // Check for %% first
7926  if (**ptr == '%') {
7927  __kmp_str_buf_cat(field_buffer, "%", 1);
7928  (*ptr)++; // skip over the second %
7929  return 1;
7930  }
7931 
7932  // Parse field modifiers if they are present
7933  pad_zeros = false;
7934  if (**ptr == '0') {
7935  pad_zeros = true;
7936  (*ptr)++; // skip over 0
7937  }
7938  right_justify = false;
7939  if (**ptr == '.') {
7940  right_justify = true;
7941  (*ptr)++; // skip over .
7942  }
7943  // Parse width of field: [width_left, width_right)
7944  width_left = width_right = NULL;
7945  if (**ptr >= '0' && **ptr <= '9') {
7946  width_left = *ptr;
7947  SKIP_DIGITS(*ptr);
7948  width_right = *ptr;
7949  }
7950 
7951  // Create the format for KMP_SNPRINTF based on flags parsed above
7952  format_index = 0;
7953  format[format_index++] = '%';
7954  if (!right_justify)
7955  format[format_index++] = '-';
7956  if (pad_zeros)
7957  format[format_index++] = '0';
7958  if (width_left && width_right) {
7959  int i = 0;
7960  // Only allow 8 digit number widths.
7961  // This also prevents overflowing format variable
7962  while (i < 8 && width_left < width_right) {
7963  format[format_index++] = *width_left;
7964  width_left++;
7965  i++;
7966  }
7967  }
7968 
7969  // Parse a name (long or short)
7970  // Canonicalize the name into absolute_short_name
7971  found_valid_name = false;
7972  parse_long_name = (**ptr == '{');
7973  if (parse_long_name)
7974  (*ptr)++; // skip initial left brace
7975  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7976  sizeof(__kmp_affinity_format_table[0]);
7977  ++i) {
7978  char short_name = __kmp_affinity_format_table[i].short_name;
7979  const char *long_name = __kmp_affinity_format_table[i].long_name;
7980  char field_format = __kmp_affinity_format_table[i].field_format;
7981  if (parse_long_name) {
7982  int length = KMP_STRLEN(long_name);
7983  if (strncmp(*ptr, long_name, length) == 0) {
7984  found_valid_name = true;
7985  (*ptr) += length; // skip the long name
7986  }
7987  } else if (**ptr == short_name) {
7988  found_valid_name = true;
7989  (*ptr)++; // skip the short name
7990  }
7991  if (found_valid_name) {
7992  format[format_index++] = field_format;
7993  format[format_index++] = '\0';
7994  absolute_short_name = short_name;
7995  break;
7996  }
7997  }
7998  if (parse_long_name) {
7999  if (**ptr != '}') {
8000  absolute_short_name = 0;
8001  } else {
8002  (*ptr)++; // skip over the right brace
8003  }
8004  }
8005 
8006  // Attempt to fill the buffer with the requested
8007  // value using snprintf within __kmp_str_buf_print()
8008  switch (absolute_short_name) {
8009  case 't':
8010  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8011  break;
8012  case 'T':
8013  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8014  break;
8015  case 'L':
8016  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8017  break;
8018  case 'n':
8019  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8020  break;
8021  case 'H': {
8022  static const int BUFFER_SIZE = 256;
8023  char buf[BUFFER_SIZE];
8024  __kmp_expand_host_name(buf, BUFFER_SIZE);
8025  rc = __kmp_str_buf_print(field_buffer, format, buf);
8026  } break;
8027  case 'P':
8028  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8029  break;
8030  case 'i':
8031  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8032  break;
8033  case 'N':
8034  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8035  break;
8036  case 'a':
8037  field_value =
8038  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8039  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8040  break;
8041 #if KMP_AFFINITY_SUPPORTED
8042  case 'A': {
8043  kmp_str_buf_t buf;
8044  __kmp_str_buf_init(&buf);
8045  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8046  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8047  __kmp_str_buf_free(&buf);
8048  } break;
8049 #endif
8050  default:
8051  // According to spec, If an implementation does not have info for field
8052  // type, then "undefined" is printed
8053  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8054  // Skip the field
8055  if (parse_long_name) {
8056  SKIP_TOKEN(*ptr);
8057  if (**ptr == '}')
8058  (*ptr)++;
8059  } else {
8060  (*ptr)++;
8061  }
8062  }
8063 
8064  KMP_ASSERT(format_index <= FORMAT_SIZE);
8065  return rc;
8066 }
8067 
8068 /*
8069  * Return number of characters needed to hold the affinity string
8070  * (not including null byte character)
8071  * The resultant string is printed to buffer, which the caller can then
8072  * handle afterwards
8073 */
8074 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8075  kmp_str_buf_t *buffer) {
8076  const char *parse_ptr;
8077  size_t retval;
8078  const kmp_info_t *th;
8079  kmp_str_buf_t field;
8080 
8081  KMP_DEBUG_ASSERT(buffer);
8082  KMP_DEBUG_ASSERT(gtid >= 0);
8083 
8084  __kmp_str_buf_init(&field);
8085  __kmp_str_buf_clear(buffer);
8086 
8087  th = __kmp_threads[gtid];
8088  retval = 0;
8089 
8090  // If format is NULL or zero-length string, then we use
8091  // affinity-format-var ICV
8092  parse_ptr = format;
8093  if (parse_ptr == NULL || *parse_ptr == '\0') {
8094  parse_ptr = __kmp_affinity_format;
8095  }
8096  KMP_DEBUG_ASSERT(parse_ptr);
8097 
8098  while (*parse_ptr != '\0') {
8099  // Parse a field
8100  if (*parse_ptr == '%') {
8101  // Put field in the buffer
8102  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8103  __kmp_str_buf_catbuf(buffer, &field);
8104  retval += rc;
8105  } else {
8106  // Put literal character in buffer
8107  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8108  retval++;
8109  parse_ptr++;
8110  }
8111  }
8112  __kmp_str_buf_free(&field);
8113  return retval;
8114 }
8115 
8116 // Displays the affinity string to stdout
8117 void __kmp_aux_display_affinity(int gtid, const char *format) {
8118  kmp_str_buf_t buf;
8119  __kmp_str_buf_init(&buf);
8120  __kmp_aux_capture_affinity(gtid, format, &buf);
8121  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8122  __kmp_str_buf_free(&buf);
8123 }
8124 #endif // OMP_50_ENABLED
8125 
8126 /* ------------------------------------------------------------------------ */
8127 
8128 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8129  int blocktime = arg; /* argument is in milliseconds */
8130 #if KMP_USE_MONITOR
8131  int bt_intervals;
8132 #endif
8133  int bt_set;
8134 
8135  __kmp_save_internal_controls(thread);
8136 
8137  /* Normalize and set blocktime for the teams */
8138  if (blocktime < KMP_MIN_BLOCKTIME)
8139  blocktime = KMP_MIN_BLOCKTIME;
8140  else if (blocktime > KMP_MAX_BLOCKTIME)
8141  blocktime = KMP_MAX_BLOCKTIME;
8142 
8143  set__blocktime_team(thread->th.th_team, tid, blocktime);
8144  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8145 
8146 #if KMP_USE_MONITOR
8147  /* Calculate and set blocktime intervals for the teams */
8148  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8149 
8150  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8151  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8152 #endif
8153 
8154  /* Set whether blocktime has been set to "TRUE" */
8155  bt_set = TRUE;
8156 
8157  set__bt_set_team(thread->th.th_team, tid, bt_set);
8158  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8159 #if KMP_USE_MONITOR
8160  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8161  "bt_intervals=%d, monitor_updates=%d\n",
8162  __kmp_gtid_from_tid(tid, thread->th.th_team),
8163  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8164  __kmp_monitor_wakeups));
8165 #else
8166  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8167  __kmp_gtid_from_tid(tid, thread->th.th_team),
8168  thread->th.th_team->t.t_id, tid, blocktime));
8169 #endif
8170 }
8171 
8172 void __kmp_aux_set_defaults(char const *str, int len) {
8173  if (!__kmp_init_serial) {
8174  __kmp_serial_initialize();
8175  }
8176  __kmp_env_initialize(str);
8177 
8178  if (__kmp_settings
8179 #if OMP_40_ENABLED
8180  || __kmp_display_env || __kmp_display_env_verbose
8181 #endif // OMP_40_ENABLED
8182  ) {
8183  __kmp_env_print();
8184  }
8185 } // __kmp_aux_set_defaults
8186 
8187 /* ------------------------------------------------------------------------ */
8188 /* internal fast reduction routines */
8189 
8190 PACKED_REDUCTION_METHOD_T
8191 __kmp_determine_reduction_method(
8192  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8193  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8194  kmp_critical_name *lck) {
8195 
8196  // Default reduction method: critical construct ( lck != NULL, like in current
8197  // PAROPT )
8198  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8199  // can be selected by RTL
8200  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8201  // can be selected by RTL
8202  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8203  // among generated by PAROPT.
8204 
8205  PACKED_REDUCTION_METHOD_T retval;
8206 
8207  int team_size;
8208 
8209  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8210  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8211 
8212 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8213  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8214 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8215 
8216  retval = critical_reduce_block;
8217 
8218  // another choice of getting a team size (with 1 dynamic deference) is slower
8219  team_size = __kmp_get_team_num_threads(global_tid);
8220  if (team_size == 1) {
8221 
8222  retval = empty_reduce_block;
8223 
8224  } else {
8225 
8226  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8227 
8228 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8229 
8230 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8231  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_KFREEBSD
8232 
8233  int teamsize_cutoff = 4;
8234 
8235 #if KMP_MIC_SUPPORTED
8236  if (__kmp_mic_type != non_mic) {
8237  teamsize_cutoff = 8;
8238  }
8239 #endif
8240  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8241  if (tree_available) {
8242  if (team_size <= teamsize_cutoff) {
8243  if (atomic_available) {
8244  retval = atomic_reduce_block;
8245  }
8246  } else {
8247  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8248  }
8249  } else if (atomic_available) {
8250  retval = atomic_reduce_block;
8251  }
8252 #else
8253 #error "Unknown or unsupported OS"
8254 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8255  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8256 
8257 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8258 
8259 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_KFREEBSD
8260 
8261  // basic tuning
8262 
8263  if (atomic_available) {
8264  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8265  retval = atomic_reduce_block;
8266  }
8267  } // otherwise: use critical section
8268 
8269 #elif KMP_OS_DARWIN
8270 
8271  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8272  if (atomic_available && (num_vars <= 3)) {
8273  retval = atomic_reduce_block;
8274  } else if (tree_available) {
8275  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8276  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8277  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8278  }
8279  } // otherwise: use critical section
8280 
8281 #else
8282 #error "Unknown or unsupported OS"
8283 #endif
8284 
8285 #else
8286 #error "Unknown or unsupported architecture"
8287 #endif
8288  }
8289 
8290  // KMP_FORCE_REDUCTION
8291 
8292  // If the team is serialized (team_size == 1), ignore the forced reduction
8293  // method and stay with the unsynchronized method (empty_reduce_block)
8294  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8295  team_size != 1) {
8296 
8297  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8298 
8299  int atomic_available, tree_available;
8300 
8301  switch ((forced_retval = __kmp_force_reduction_method)) {
8302  case critical_reduce_block:
8303  KMP_ASSERT(lck); // lck should be != 0
8304  break;
8305 
8306  case atomic_reduce_block:
8307  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8308  if (!atomic_available) {
8309  KMP_WARNING(RedMethodNotSupported, "atomic");
8310  forced_retval = critical_reduce_block;
8311  }
8312  break;
8313 
8314  case tree_reduce_block:
8315  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8316  if (!tree_available) {
8317  KMP_WARNING(RedMethodNotSupported, "tree");
8318  forced_retval = critical_reduce_block;
8319  } else {
8320 #if KMP_FAST_REDUCTION_BARRIER
8321  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8322 #endif
8323  }
8324  break;
8325 
8326  default:
8327  KMP_ASSERT(0); // "unsupported method specified"
8328  }
8329 
8330  retval = forced_retval;
8331  }
8332 
8333  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8334 
8335 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8336 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8337 
8338  return (retval);
8339 }
8340 
8341 // this function is for testing set/get/determine reduce method
8342 kmp_int32 __kmp_get_reduce_method(void) {
8343  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8344 }
8345 
8346 #if OMP_50_ENABLED
8347 
8348 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8349 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8350 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8351 
8352 // Hard pause shuts down the runtime completely. Resume happens naturally when
8353 // OpenMP is used subsequently.
8354 void __kmp_hard_pause() {
8355  __kmp_pause_status = kmp_hard_paused;
8356  __kmp_internal_end_thread(-1);
8357 }
8358 
8359 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8360 void __kmp_resume_if_soft_paused() {
8361  if (__kmp_pause_status == kmp_soft_paused) {
8362  __kmp_pause_status = kmp_not_paused;
8363 
8364  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8365  kmp_info_t *thread = __kmp_threads[gtid];
8366  if (thread) { // Wake it if sleeping
8367  kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8368  if (fl.is_sleeping())
8369  fl.resume(gtid);
8370  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8371  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8372  } else { // thread holds the lock and may sleep soon
8373  do { // until either the thread sleeps, or we can get the lock
8374  if (fl.is_sleeping()) {
8375  fl.resume(gtid);
8376  break;
8377  } else if (__kmp_try_suspend_mx(thread)) {
8378  __kmp_unlock_suspend_mx(thread);
8379  break;
8380  }
8381  } while (1);
8382  }
8383  }
8384  }
8385  }
8386 }
8387 
8388 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8389 // TODO: add warning messages
8390 int __kmp_pause_resource(kmp_pause_status_t level) {
8391  if (level == kmp_not_paused) { // requesting resume
8392  if (__kmp_pause_status == kmp_not_paused) {
8393  // error message about runtime not being paused, so can't resume
8394  return 1;
8395  } else {
8396  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8397  __kmp_pause_status == kmp_hard_paused);
8398  __kmp_pause_status = kmp_not_paused;
8399  return 0;
8400  }
8401  } else if (level == kmp_soft_paused) { // requesting soft pause
8402  if (__kmp_pause_status != kmp_not_paused) {
8403  // error message about already being paused
8404  return 1;
8405  } else {
8406  __kmp_soft_pause();
8407  return 0;
8408  }
8409  } else if (level == kmp_hard_paused) { // requesting hard pause
8410  if (__kmp_pause_status != kmp_not_paused) {
8411  // error message about already being paused
8412  return 1;
8413  } else {
8414  __kmp_hard_pause();
8415  return 0;
8416  }
8417  } else {
8418  // error message about invalid level
8419  return 1;
8420  }
8421 }
8422 
8423 #endif // OMP_50_ENABLED
#define KMP_COUNT_VALUE(name, value)
Adds value to specified timer (name).
Definition: kmp_stats.h:887
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid)
#define KMP_INIT_PARTITIONED_TIMERS(name)
Initializes the paritioned timers to begin with name.
Definition: kmp_stats.h:929
sched_type
Definition: kmp.h:343
Definition: kmp.h:229
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid)
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63
kmp_int32 flags
Definition: kmp.h:231