LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  * it may change values between parallel regions. __kmp_max_nth
17  * is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44 
45  KMP_DEBUG_ASSERT(gtid_ref);
46 
47  if (__kmp_env_consistency_check) {
48  th = __kmp_threads[*gtid_ref];
49  if (th->th.th_root->r.r_active &&
50  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56  }
57  }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62 
63  if (__kmp_env_consistency_check) {
64  th = __kmp_threads[*gtid_ref];
65  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67  }
68  }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73  bool use_hier = false) {
74  // Pick up the nonmonotonic/monotonic bits from the scheduling type
75  int monotonicity;
76  // default to monotonic
77  monotonicity = SCHEDULE_MONOTONIC;
78  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79  monotonicity = SCHEDULE_NONMONOTONIC;
80  else if (SCHEDULE_HAS_MONOTONIC(schedule))
81  monotonicity = SCHEDULE_MONOTONIC;
82  return monotonicity;
83 }
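// Editorial sketch (illustrative, not part of the runtime): how a schedule word
// is classified by __kmp_get_monotonicity above, assuming the modifier enums
// kmp_sch_modifier_monotonic/nonmonotonic from kmp.h:
//
//   enum sched_type s = (enum sched_type)(kmp_sch_dynamic_chunked |
//                                          kmp_sch_modifier_nonmonotonic);
//   __kmp_get_monotonicity(s);                        // -> SCHEDULE_NONMONOTONIC
//   __kmp_get_monotonicity(kmp_sch_dynamic_chunked);  // -> SCHEDULE_MONOTONIC (default)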
84 
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule and chunk. The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride). nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used). tid is the id of the thread calling
90 // the function within the group of nproc threads. It will have a value
91 // between 0 and nproc - 1. This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop
94 // gtid is the global thread id
95 template <typename T>
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97  dispatch_private_info_template<T> *pr,
98  enum sched_type schedule, T lb, T ub,
99  typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101  kmp_uint64 *cur_chunk,
102 #endif
103  typename traits_t<T>::signed_t chunk,
104  T nproc, T tid) {
105  typedef typename traits_t<T>::unsigned_t UT;
106  typedef typename traits_t<T>::floating_t DBL;
107 
108  int active;
109  T tc;
110  kmp_info_t *th;
111  kmp_team_t *team;
112  int monotonicity;
113  bool use_hier;
114 
115 #ifdef KMP_DEBUG
116  typedef typename traits_t<T>::signed_t ST;
117  {
118  char *buff;
119  // create format specifiers before the debug output
120  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123  traits_t<T>::spec, traits_t<T>::spec,
124  traits_t<ST>::spec, traits_t<ST>::spec,
125  traits_t<T>::spec, traits_t<T>::spec);
126  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127  __kmp_str_free(&buff);
128  }
129 #endif
130  /* setup data */
131  th = __kmp_threads[gtid];
132  team = th->th.th_team;
133  active = !team->t.t_serialized;
134 
135 #if USE_ITT_BUILD
136  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
137  __kmp_forkjoin_frames_mode == 3 &&
138  KMP_MASTER_GTID(gtid) &&
139 #if OMP_40_ENABLED
140  th->th.th_teams_microtask == NULL &&
141 #endif
142  team->t.t_active_level == 1;
143 #endif
144 
145 #if KMP_USE_HIER_SCHED
146  use_hier = pr->flags.use_hier;
147 #else
148  use_hier = false;
149 #endif
150 
151  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
152  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
153  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
154 
155  /* Pick up the nomerge/ordered bits from the scheduling type */
156  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
157  pr->flags.nomerge = TRUE;
158  schedule =
159  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
160  } else {
161  pr->flags.nomerge = FALSE;
162  }
163  pr->type_size = traits_t<T>::type_size; // remember the size of variables
164  if (kmp_ord_lower & schedule) {
165  pr->flags.ordered = TRUE;
166  schedule =
167  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
168  } else {
169  pr->flags.ordered = FALSE;
170  }
171  // Ordered overrides nonmonotonic
172  if (pr->flags.ordered) {
173  monotonicity = SCHEDULE_MONOTONIC;
174  }
175 
176  if (schedule == kmp_sch_static) {
177  schedule = __kmp_static;
178  } else {
179  if (schedule == kmp_sch_runtime) {
180  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
181  // not specified)
182  schedule = team->t.t_sched.r_sched_type;
183  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
184  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
185  // Refine the schedule if needed (global controls are differentiated
186  // appropriately)
187  if (schedule == kmp_sch_guided_chunked) {
188  schedule = __kmp_guided;
189  } else if (schedule == kmp_sch_static) {
190  schedule = __kmp_static;
191  }
192  // Use the chunk size specified by OMP_SCHEDULE (or default if not
193  // specified)
194  chunk = team->t.t_sched.chunk;
195 #if USE_ITT_BUILD
196  if (cur_chunk)
197  *cur_chunk = chunk;
198 #endif
199 #ifdef KMP_DEBUG
200  {
201  char *buff;
202  // create format specifiers before the debug output
203  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
204  "schedule:%%d chunk:%%%s\n",
205  traits_t<ST>::spec);
206  KD_TRACE(10, (buff, gtid, schedule, chunk));
207  __kmp_str_free(&buff);
208  }
209 #endif
210  } else {
211  if (schedule == kmp_sch_guided_chunked) {
212  schedule = __kmp_guided;
213  }
214  if (chunk <= 0) {
215  chunk = KMP_DEFAULT_CHUNK;
216  }
217  }
218 
219  if (schedule == kmp_sch_auto) {
220  // the mapping and differentiation are set up in __kmp_do_serial_initialize()
221  schedule = __kmp_auto;
222 #ifdef KMP_DEBUG
223  {
224  char *buff;
225  // create format specifiers before the debug output
226  buff = __kmp_str_format(
227  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
228  "schedule:%%d chunk:%%%s\n",
229  traits_t<ST>::spec);
230  KD_TRACE(10, (buff, gtid, schedule, chunk));
231  __kmp_str_free(&buff);
232  }
233 #endif
234  }
235 #if KMP_STATIC_STEAL_ENABLED
236  // map nonmonotonic:dynamic to static steal
237  if (schedule == kmp_sch_dynamic_chunked) {
238  if (monotonicity == SCHEDULE_NONMONOTONIC)
239  schedule = kmp_sch_static_steal;
240  }
241 #endif
242  /* guided analytical not safe for too many threads */
243  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
244  schedule = kmp_sch_guided_iterative_chunked;
245  KMP_WARNING(DispatchManyThreads);
246  }
247 #if OMP_45_ENABLED
248  if (schedule == kmp_sch_runtime_simd) {
249  // compiler provides simd_width in the chunk parameter
250  schedule = team->t.t_sched.r_sched_type;
251  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
252  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
253  // Refine the schedule if needed (global controls are differentiated
254  // appropriately)
255  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
256  schedule == __kmp_static) {
257  schedule = kmp_sch_static_balanced_chunked;
258  } else {
259  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
260  schedule = kmp_sch_guided_simd;
261  }
262  chunk = team->t.t_sched.chunk * chunk;
263  }
264 #if USE_ITT_BUILD
265  if (cur_chunk)
266  *cur_chunk = chunk;
267 #endif
268 #ifdef KMP_DEBUG
269  {
270  char *buff;
271  // create format specifiers before the debug output
272  buff = __kmp_str_format(
273  "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
274  " chunk:%%%s\n",
275  traits_t<ST>::spec);
276  KD_TRACE(10, (buff, gtid, schedule, chunk));
277  __kmp_str_free(&buff);
278  }
279 #endif
280  }
281 #endif // OMP_45_ENABLED
282  pr->u.p.parm1 = chunk;
283  }
284  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
285  "unknown scheduling type");
286 
287  pr->u.p.count = 0;
288 
289  if (__kmp_env_consistency_check) {
290  if (st == 0) {
291  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
292  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
293  }
294  }
295  // compute trip count
296  if (st == 1) { // most common case
297  if (ub >= lb) {
298  tc = ub - lb + 1;
299  } else { // ub < lb
300  tc = 0; // zero-trip
301  }
302  } else if (st < 0) {
303  if (lb >= ub) {
304  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
305  // where the division needs to be unsigned regardless of the result type
306  tc = (UT)(lb - ub) / (-st) + 1;
307  } else { // lb < ub
308  tc = 0; // zero-trip
309  }
310  } else { // st > 0
311  if (ub >= lb) {
312  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
313  // where the division needs to be unsigned regardless of the result type
314  tc = (UT)(ub - lb) / st + 1;
315  } else { // ub < lb
316  tc = 0; // zero-trip
317  }
318  }
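// Editorial example (comment only): worked trip counts from the formulas above.
//   lb = 0,  ub = 9, st = 1   ->  tc = 9 - 0 + 1        = 10
//   lb = 10, ub = 1, st = -3  ->  tc = (10 - 1) / 3 + 1 = 4   (i = 10, 7, 4, 1)
//   lb = 5,  ub = 4, st = 1   ->  tc = 0  (zero-trip loop)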
319 
320 #if KMP_STATS_ENABLED
321  if (KMP_MASTER_GTID(gtid)) {
322  KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
323  }
324 #endif
325 
326  pr->u.p.lb = lb;
327  pr->u.p.ub = ub;
328  pr->u.p.st = st;
329  pr->u.p.tc = tc;
330 
331 #if KMP_OS_WINDOWS
332  pr->u.p.last_upper = ub + st;
333 #endif /* KMP_OS_WINDOWS */
334 
335  /* NOTE: only the active parallel region(s) has active ordered sections */
336 
337  if (active) {
338  if (pr->flags.ordered) {
339  pr->ordered_bumped = 0;
340  pr->u.p.ordered_lower = 1;
341  pr->u.p.ordered_upper = 0;
342  }
343  }
344 
345  switch (schedule) {
346 #if (KMP_STATIC_STEAL_ENABLED)
347  case kmp_sch_static_steal: {
348  T ntc, init;
349 
350  KD_TRACE(100,
351  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
352  gtid));
353 
354  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
355  if (nproc > 1 && ntc >= nproc) {
356  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
357  T id = tid;
358  T small_chunk, extras;
359 
360  small_chunk = ntc / nproc;
361  extras = ntc % nproc;
362 
363  init = id * small_chunk + (id < extras ? id : extras);
364  pr->u.p.count = init;
365  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
366 
367  pr->u.p.parm2 = lb;
368  // parm3 is the number of times to attempt stealing, which is
369  // proportional to the number of chunks per thread, capped at
370  // the maximum value of nproc.
371  pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
372  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
373  pr->u.p.st = st;
374  if (traits_t<T>::type_size > 4) {
375  // AC: TODO: check if 16-byte CAS available and use it to
376  // improve performance (probably wait for explicit request
377  // before spending time on this).
378  // For now use dynamically allocated per-thread lock,
379  // free memory in __kmp_dispatch_next when status==0.
380  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
381  th->th.th_dispatch->th_steal_lock =
382  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
383  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
384  }
385  break;
386  } else {
387  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
388  "kmp_sch_static_balanced\n",
389  gtid));
390  schedule = kmp_sch_static_balanced;
391  /* too few iterations: fall-through to kmp_sch_static_balanced */
392  } // if
393  /* FALL-THROUGH to static balanced */
394  KMP_FALLTHROUGH();
395  } // case
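// Editorial example (comment only) for the static_steal setup above:
// tc = 100, chunk = 10 gives ntc = 10 chunks; with nproc = 4, small_chunk = 2
// and extras = 2, so the initial [count, ub) chunk ranges per tid are
// tid 0: [0, 3), tid 1: [3, 6), tid 2: [6, 8), tid 3: [8, 10).
// parm4 starts as the next tid (the first victim to try) and
// parm3 = min(small_chunk + extras, nproc) = 4 steal attempts.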
396 #endif
397  case kmp_sch_static_balanced: {
398  T init, limit;
399 
400  KD_TRACE(
401  100,
402  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
403  gtid));
404 
405  if (nproc > 1) {
406  T id = tid;
407 
408  if (tc < nproc) {
409  if (id < tc) {
410  init = id;
411  limit = id;
412  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
413  } else {
414  pr->u.p.count = 1; /* means no more chunks to execute */
415  pr->u.p.parm1 = FALSE;
416  break;
417  }
418  } else {
419  T small_chunk = tc / nproc;
420  T extras = tc % nproc;
421  init = id * small_chunk + (id < extras ? id : extras);
422  limit = init + small_chunk - (id < extras ? 0 : 1);
423  pr->u.p.parm1 = (id == nproc - 1);
424  }
425  } else {
426  if (tc > 0) {
427  init = 0;
428  limit = tc - 1;
429  pr->u.p.parm1 = TRUE;
430  } else {
431  // zero trip count
432  pr->u.p.count = 1; /* means no more chunks to execute */
433  pr->u.p.parm1 = FALSE;
434  break;
435  }
436  }
437 #if USE_ITT_BUILD
438  // Calculate chunk for metadata report
439  if (itt_need_metadata_reporting)
440  if (cur_chunk)
441  *cur_chunk = limit - init + 1;
442 #endif
443  if (st == 1) {
444  pr->u.p.lb = lb + init;
445  pr->u.p.ub = lb + limit;
446  } else {
447  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
448  T ub_tmp = lb + limit * st;
449  pr->u.p.lb = lb + init * st;
450  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
451  // it exactly
452  if (st > 0) {
453  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
454  } else {
455  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
456  }
457  }
458  if (pr->flags.ordered) {
459  pr->u.p.ordered_lower = init;
460  pr->u.p.ordered_upper = limit;
461  }
462  break;
463  } // case
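// Editorial example (comment only) for kmp_sch_static_balanced above:
// tc = 10, nproc = 4 -> small_chunk = 2, extras = 2, so the iteration ranges
// (init..limit) are tid 0: 0..2, tid 1: 3..5, tid 2: 6..7, tid 3: 8..9.
// If instead tc = 3 and nproc = 4, tids 0..2 each get exactly one iteration
// and tid 3 immediately sets count = 1 (no chunk to execute).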
464 #if OMP_45_ENABLED
465  case kmp_sch_static_balanced_chunked: {
466  // similar to balanced, but chunk adjusted to multiple of simd width
467  T nth = nproc;
468  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
469  " -> falling-through to static_greedy\n",
470  gtid));
471  schedule = kmp_sch_static_greedy;
472  if (nth > 1)
473  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
474  else
475  pr->u.p.parm1 = tc;
476  break;
477  } // case
478  case kmp_sch_guided_simd:
479 #endif // OMP_45_ENABLED
480  case kmp_sch_guided_iterative_chunked: {
481  KD_TRACE(
482  100,
483  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
484  " case\n",
485  gtid));
486 
487  if (nproc > 1) {
488  if ((2L * chunk + 1) * nproc >= tc) {
489  /* chunk size too large, switch to dynamic */
490  schedule = kmp_sch_dynamic_chunked;
491  } else {
492  // when remaining iters become less than parm2 - switch to dynamic
493  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
494  *(double *)&pr->u.p.parm3 =
495  guided_flt_param / nproc; // may occupy parm3 and parm4
496  }
497  } else {
498  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
499  "kmp_sch_static_greedy\n",
500  gtid));
501  schedule = kmp_sch_static_greedy;
502  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
503  KD_TRACE(
504  100,
505  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
506  gtid));
507  pr->u.p.parm1 = tc;
508  } // if
509  } // case
510  break;
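// Editorial example (comment only), assuming the default guided parameters
// (guided_int_param = 2, guided_flt_param = 0.5): with nproc = 4, chunk = 1 and
// tc = 1000, the setup above yields parm2 = 2 * 4 * 2 = 16 and
// *(double *)&parm3 = 0.5 / 4 = 0.125, i.e. each grab in
// __kmp_dispatch_next_algorithm takes roughly 1/8 of the remaining iterations
// until fewer than 16 remain, after which plain dynamic chunks of size 1 are used.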
511  case kmp_sch_guided_analytical_chunked: {
512  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
513  "kmp_sch_guided_analytical_chunked case\n",
514  gtid));
515 
516  if (nproc > 1) {
517  if ((2L * chunk + 1) * nproc >= tc) {
518  /* chunk size too large, switch to dynamic */
519  schedule = kmp_sch_dynamic_chunked;
520  } else {
521  /* commonly used term: (2 nproc - 1)/(2 nproc) */
522  DBL x;
523 
524 #if KMP_USE_X87CONTROL
525  /* Linux* OS already has 64-bit computation by default for long double,
526  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
527  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
528  instead of the default 53-bit. Even though long double doesn't work
529  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
530  expected to impact the correctness of the algorithm, but this has not
531  been mathematically proven. */
532  // save original FPCW and set precision to 64-bit, as
533  // Windows* OS on IA-32 architecture defaults to 53-bit
534  unsigned int oldFpcw = _control87(0, 0);
535  _control87(_PC_64, _MCW_PC); // 0,0x30000
536 #endif
537  /* value used for comparison in solver for cross-over point */
538  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
539 
540  /* crossover point--chunk indexes equal to or greater than
541  this point switch to dynamic-style scheduling */
542  UT cross;
543 
544  /* commonly used term: (2 nproc - 1)/(2 nproc) */
545  x = (long double)1.0 - (long double)0.5 / nproc;
546 
547 #ifdef KMP_DEBUG
548  { // test natural alignment
549  struct _test_a {
550  char a;
551  union {
552  char b;
553  DBL d;
554  };
555  } t;
556  ptrdiff_t natural_alignment =
557  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
558  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
559  // long)natural_alignment );
560  KMP_DEBUG_ASSERT(
561  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
562  }
563 #endif // KMP_DEBUG
564 
565  /* save the term in thread private dispatch structure */
566  *(DBL *)&pr->u.p.parm3 = x;
567 
568  /* solve for the crossover point to the nearest integer i for which C_i
569  <= chunk */
570  {
571  UT left, right, mid;
572  long double p;
573 
574  /* estimate initial upper and lower bound */
575 
576  /* doesn't matter what value right is as long as it is positive, but
577  it affects performance of the solver */
578  right = 229;
579  p = __kmp_pow<UT>(x, right);
580  if (p > target) {
581  do {
582  p *= p;
583  right <<= 1;
584  } while (p > target && right < (1 << 27));
585  /* lower bound is previous (failed) estimate of upper bound */
586  left = right >> 1;
587  } else {
588  left = 0;
589  }
590 
591  /* bisection root-finding method */
592  while (left + 1 < right) {
593  mid = (left + right) / 2;
594  if (__kmp_pow<UT>(x, mid) > target) {
595  left = mid;
596  } else {
597  right = mid;
598  }
599  } // while
600  cross = right;
601  }
602  /* assert sanity of computed crossover point */
603  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
604  __kmp_pow<UT>(x, cross) <= target);
605 
606  /* save the crossover point in thread private dispatch structure */
607  pr->u.p.parm2 = cross;
608 
609 // C75803
610 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
611 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
612 #else
613 #define GUIDED_ANALYTICAL_WORKAROUND (x)
614 #endif
615  /* dynamic-style scheduling offset */
616  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
617  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
618  cross * chunk;
619 #if KMP_USE_X87CONTROL
620  // restore FPCW
621  _control87(oldFpcw, _MCW_PC);
622 #endif
623  } // if
624  } else {
625  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
626  "kmp_sch_static_greedy\n",
627  gtid));
628  schedule = kmp_sch_static_greedy;
629  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
630  pr->u.p.parm1 = tc;
631  } // if
632  } // case
633  break;
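// Editorial example (comment only) for the crossover computed above:
// with nproc = 4, chunk = 1, tc = 1000, x = 1 - 0.5/4 = 0.875 and
// target = (2*1 + 1) * 4 / 1000 = 0.012; the bisection finds cross ~= 34, the
// smallest index with 0.875^cross <= target. Chunk sizes below that index
// shrink geometrically; chunk indexes >= cross fall back to fixed chunks of
// size chunk (= 1 here).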
634  case kmp_sch_static_greedy:
635  KD_TRACE(
636  100,
637  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
638  gtid));
639  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
640  break;
641  case kmp_sch_static_chunked:
642  case kmp_sch_dynamic_chunked:
643  if (pr->u.p.parm1 <= 0) {
644  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
645  }
646  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
647  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
648  gtid));
649  break;
650  case kmp_sch_trapezoidal: {
651  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
652 
653  T parm1, parm2, parm3, parm4;
654  KD_TRACE(100,
655  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
656  gtid));
657 
658  parm1 = chunk;
659 
660  /* F : size of the first cycle */
661  parm2 = (tc / (2 * nproc));
662 
663  if (parm2 < 1) {
664  parm2 = 1;
665  }
666 
667  /* L : size of the last cycle. Make sure the last cycle is not larger
668  than the first cycle. */
669  if (parm1 < 1) {
670  parm1 = 1;
671  } else if (parm1 > parm2) {
672  parm1 = parm2;
673  }
674 
675  /* N : number of cycles */
676  parm3 = (parm2 + parm1);
677  parm3 = (2 * tc + parm3 - 1) / parm3;
678 
679  if (parm3 < 2) {
680  parm3 = 2;
681  }
682 
683  /* sigma : decreasing incr of the trapezoid */
684  parm4 = (parm3 - 1);
685  parm4 = (parm2 - parm1) / parm4;
686 
687  // pointless check, because parm4 >= 0 always
688  // if ( parm4 < 0 ) {
689  // parm4 = 0;
690  //}
691 
692  pr->u.p.parm1 = parm1;
693  pr->u.p.parm2 = parm2;
694  pr->u.p.parm3 = parm3;
695  pr->u.p.parm4 = parm4;
696  } // case
697  break;
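// Editorial example (comment only) for the TSS parameters above:
// tc = 1000, nproc = 4, chunk = 1 gives parm2 = 1000 / 8 = 125 (first chunk),
// parm1 = 1 (minimum/last chunk), parm3 = (2000 + 126 - 1) / 126 = 16 cycles
// and parm4 = (125 - 1) / 15 = 8, so chunk sizes decrease 125, 117, 109, ...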
698 
699  default: {
700  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
701  KMP_HNT(GetNewerLibrary), // Hint
702  __kmp_msg_null // Variadic argument list terminator
703  );
704  } break;
705  } // switch
706  pr->schedule = schedule;
707 }
708 
709 #if KMP_USE_HIER_SCHED
710 template <typename T>
711 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
712  typename traits_t<T>::signed_t st);
713 template <>
714 inline void
715 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
716  kmp_int32 ub, kmp_int32 st) {
717  __kmp_dispatch_init_hierarchy<kmp_int32>(
718  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
719  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
720 }
721 template <>
722 inline void
723 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
724  kmp_uint32 ub, kmp_int32 st) {
725  __kmp_dispatch_init_hierarchy<kmp_uint32>(
726  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
727  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
728 }
729 template <>
730 inline void
731 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
732  kmp_int64 ub, kmp_int64 st) {
733  __kmp_dispatch_init_hierarchy<kmp_int64>(
734  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
735  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
736 }
737 template <>
738 inline void
739 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
740  kmp_uint64 ub, kmp_int64 st) {
741  __kmp_dispatch_init_hierarchy<kmp_uint64>(
742  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
743  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
744 }
745 
746 // free all the hierarchy scheduling memory associated with the team
747 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
748  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
749  for (int i = 0; i < num_disp_buff; ++i) {
750  // type does not matter here so use kmp_int32
751  auto sh =
752  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
753  &team->t.t_disp_buffer[i]);
754  if (sh->hier) {
755  sh->hier->deallocate();
756  __kmp_free(sh->hier);
757  }
758  }
759 }
760 #endif
761 
762 // UT - unsigned flavor of T, ST - signed flavor of T,
763 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
764 template <typename T>
765 static void
766 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
767  T ub, typename traits_t<T>::signed_t st,
768  typename traits_t<T>::signed_t chunk, int push_ws) {
769  typedef typename traits_t<T>::unsigned_t UT;
770 
771  int active;
772  kmp_info_t *th;
773  kmp_team_t *team;
774  kmp_uint32 my_buffer_index;
775  dispatch_private_info_template<T> *pr;
776  dispatch_shared_info_template<T> volatile *sh;
777 
778  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
779  sizeof(dispatch_private_info));
780  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
781  sizeof(dispatch_shared_info));
782 
783  if (!TCR_4(__kmp_init_parallel))
784  __kmp_parallel_initialize();
785 
786 #if OMP_50_ENABLED
787  __kmp_resume_if_soft_paused();
788 #endif
789 
790 #if INCLUDE_SSC_MARKS
791  SSC_MARK_DISPATCH_INIT();
792 #endif
793 #ifdef KMP_DEBUG
794  typedef typename traits_t<T>::signed_t ST;
795  {
796  char *buff;
797  // create format specifiers before the debug output
798  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
799  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
800  traits_t<ST>::spec, traits_t<T>::spec,
801  traits_t<T>::spec, traits_t<ST>::spec);
802  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
803  __kmp_str_free(&buff);
804  }
805 #endif
806  /* setup data */
807  th = __kmp_threads[gtid];
808  team = th->th.th_team;
809  active = !team->t.t_serialized;
810  th->th.th_ident = loc;
811 
812  // Any half-decent optimizer will remove this test when the blocks are empty
813  // since the macros expand to nothing
814  // when statistics are disabled.
815  if (schedule == __kmp_static) {
816  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
817  } else {
818  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
819  }
820 
821 #if KMP_USE_HIER_SCHED
822  // Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE
823  // environment variable. Hierarchical scheduling does not work with ordered,
824  // so if ordered is detected, revert to threaded scheduling.
825  bool ordered;
826  enum sched_type my_sched = schedule;
827  my_buffer_index = th->th.th_dispatch->th_disp_index;
828  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
829  &th->th.th_dispatch
830  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
831  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
832  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
833  my_sched =
834  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
835  ordered = (kmp_ord_lower & my_sched);
836  if (pr->flags.use_hier) {
837  if (ordered) {
838  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
839  "Disabling hierarchical scheduling.\n",
840  gtid));
841  pr->flags.use_hier = FALSE;
842  }
843  }
844  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
845  // Don't use hierarchical for ordered parallel loops and don't
846  // use the runtime hierarchy if one was specified in the program
847  if (!ordered && !pr->flags.use_hier)
848  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
849  }
850 #endif // KMP_USE_HIER_SCHED
851 
852 #if USE_ITT_BUILD
853  kmp_uint64 cur_chunk = chunk;
854  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
855  __kmp_forkjoin_frames_mode == 3 &&
856  KMP_MASTER_GTID(gtid) &&
857 #if OMP_40_ENABLED
858  th->th.th_teams_microtask == NULL &&
859 #endif
860  team->t.t_active_level == 1;
861 #endif
862  if (!active) {
863  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
864  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
865  } else {
866  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
867  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
868 
869  my_buffer_index = th->th.th_dispatch->th_disp_index++;
870 
871  /* What happens when the number of threads changes? Do we need to resize the buffer? */
872  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
873  &th->th.th_dispatch
874  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
875  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
876  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
877  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
878  my_buffer_index));
879  }
880 
881  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
882 #if USE_ITT_BUILD
883  &cur_chunk,
884 #endif
885  chunk, (T)th->th.th_team_nproc,
886  (T)th->th.th_info.ds.ds_tid);
887  if (active) {
888  if (pr->flags.ordered == 0) {
889  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
890  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
891  } else {
892  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
893  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
894  }
895  }
896 
897  if (active) {
898  /* sh->buffer_index should equal my_buffer_index when this buffer is free
899  * for us to use */
900 
901  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
902  "sh->buffer_index:%d\n",
903  gtid, my_buffer_index, sh->buffer_index));
904  __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
905  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
906  // Note: KMP_WAIT() cannot be used here: the buffer index and
907  // my_buffer_index are *always* 32-bit integers.
908  KMP_MB(); /* is this necessary? */
909  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
910  "sh->buffer_index:%d\n",
911  gtid, my_buffer_index, sh->buffer_index));
912 
913  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
914  th->th.th_dispatch->th_dispatch_sh_current =
915  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
916 #if USE_ITT_BUILD
917  if (pr->flags.ordered) {
918  __kmp_itt_ordered_init(gtid);
919  }
920  // Report loop metadata
921  if (itt_need_metadata_reporting) {
922  // Only report metadata by master of active team at level 1
923  kmp_uint64 schedtype = 0;
924  switch (schedule) {
925  case kmp_sch_static_chunked:
926  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
927  break;
928  case kmp_sch_static_greedy:
929  cur_chunk = pr->u.p.parm1;
930  break;
931  case kmp_sch_dynamic_chunked:
932  schedtype = 1;
933  break;
934  case kmp_sch_guided_iterative_chunked:
935  case kmp_sch_guided_analytical_chunked:
936 #if OMP_45_ENABLED
937  case kmp_sch_guided_simd:
938 #endif
939  schedtype = 2;
940  break;
941  default:
942  // Should we put this case under "static"?
943  // case kmp_sch_static_steal:
944  schedtype = 3;
945  break;
946  }
947  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
948  }
949 #if KMP_USE_HIER_SCHED
950  if (pr->flags.use_hier) {
951  pr->u.p.count = 0;
952  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
953  }
954 #endif // KMP_USE_HIER_SCHED
955 #endif /* USE_ITT_BUILD */
956  }
957 
958 #ifdef KMP_DEBUG
959  {
960  char *buff;
961  // create format specifiers before the debug output
962  buff = __kmp_str_format(
963  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
964  "lb:%%%s ub:%%%s"
965  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
966  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
967  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
968  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
969  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
970  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
971  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
972  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
973  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
974  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
975  __kmp_str_free(&buff);
976  }
977 #endif
978 #if (KMP_STATIC_STEAL_ENABLED)
979  // It cannot be guaranteed that after execution of a loop with some other
980  // schedule kind all the parm3 variables will contain the same value. Even if
981  // all parm3 values were the same, a bad case would still exist, such as using
982  // 0 and 1 rather than a program-lifetime increment, so a dedicated variable
983  // is required. The 'static_steal_counter' is used.
984  if (schedule == kmp_sch_static_steal) {
985  // Other threads will inspect this variable when searching for a victim.
986  // This is a flag showing that other threads may steal from this thread
987  // from now on.
988  volatile T *p = &pr->u.p.static_steal_counter;
989  *p = *p + 1;
990  }
991 #endif // ( KMP_STATIC_STEAL_ENABLED )
992 
993 #if OMPT_SUPPORT && OMPT_OPTIONAL
994  if (ompt_enabled.ompt_callback_work) {
995  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
996  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
997  ompt_callbacks.ompt_callback(ompt_callback_work)(
998  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
999  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1000  }
1001 #endif
1002  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1003 }
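// Editorial sketch (comment only, not part of the runtime): roughly how a user
// loop reaches this routine. A loop such as
//
//   #pragma omp parallel for schedule(dynamic, 4)
//   for (int i = 0; i < 100; ++i)
//     body(i);
//
// is lowered by the compiler into a call to the __kmpc_dispatch_init_4 entry
// point (defined later in this file), which forwards to
// __kmp_dispatch_init<kmp_int32>(loc, gtid, kmp_sch_dynamic_chunked,
//                                /*lb=*/0, /*ub=*/99, /*st=*/1, /*chunk=*/4, true),
// followed by a __kmpc_dispatch_next_4 polling loop that hands out chunks.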
1004 
1005 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1006  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1007  * every chunk of iterations. If the ordered section(s) were not executed
1008  * for this iteration (or every iteration in this chunk), we need to set the
1009  * ordered iteration counters so that the next thread can proceed. */
1010 template <typename UT>
1011 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1012  typedef typename traits_t<UT>::signed_t ST;
1013  kmp_info_t *th = __kmp_threads[gtid];
1014 
1015  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1016  if (!th->th.th_team->t.t_serialized) {
1017 
1018  dispatch_private_info_template<UT> *pr =
1019  reinterpret_cast<dispatch_private_info_template<UT> *>(
1020  th->th.th_dispatch->th_dispatch_pr_current);
1021  dispatch_shared_info_template<UT> volatile *sh =
1022  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1023  th->th.th_dispatch->th_dispatch_sh_current);
1024  KMP_DEBUG_ASSERT(pr);
1025  KMP_DEBUG_ASSERT(sh);
1026  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1027  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1028 
1029  if (pr->ordered_bumped) {
1030  KD_TRACE(
1031  1000,
1032  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1033  gtid));
1034  pr->ordered_bumped = 0;
1035  } else {
1036  UT lower = pr->u.p.ordered_lower;
1037 
1038 #ifdef KMP_DEBUG
1039  {
1040  char *buff;
1041  // create format specifiers before the debug output
1042  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1043  "ordered_iteration:%%%s lower:%%%s\n",
1044  traits_t<UT>::spec, traits_t<UT>::spec);
1045  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1046  __kmp_str_free(&buff);
1047  }
1048 #endif
1049 
1050  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1051  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1052  KMP_MB(); /* is this necessary? */
1053 #ifdef KMP_DEBUG
1054  {
1055  char *buff;
1056  // create format specifiers before the debug output
1057  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1058  "ordered_iteration:%%%s lower:%%%s\n",
1059  traits_t<UT>::spec, traits_t<UT>::spec);
1060  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1061  __kmp_str_free(&buff);
1062  }
1063 #endif
1064 
1065  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1066  } // if
1067  } // if
1068  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1069 }
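// Editorial example (comment only): for an ordered dynamic loop such as
//
//   #pragma omp for ordered schedule(dynamic, 4)
//   for (int i = 0; i < n; ++i) {
//     if (cond(i)) {
//       #pragma omp ordered
//       { ... }
//     }
//   }
//
// an iteration that skips the ordered construct has not advanced
// sh->u.s.ordered_iteration, so __kmp_dispatch_finish() above waits for its
// turn and bumps the shared counter itself, letting the thread that owns the
// next iteration (or, in the GOMP-compat path below, the next chunk) proceed.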
1070 
1071 #ifdef KMP_GOMP_COMPAT
1072 
1073 template <typename UT>
1074 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1075  typedef typename traits_t<UT>::signed_t ST;
1076  kmp_info_t *th = __kmp_threads[gtid];
1077 
1078  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1079  if (!th->th.th_team->t.t_serialized) {
1080  // int cid;
1081  dispatch_private_info_template<UT> *pr =
1082  reinterpret_cast<dispatch_private_info_template<UT> *>(
1083  th->th.th_dispatch->th_dispatch_pr_current);
1084  dispatch_shared_info_template<UT> volatile *sh =
1085  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1086  th->th.th_dispatch->th_dispatch_sh_current);
1087  KMP_DEBUG_ASSERT(pr);
1088  KMP_DEBUG_ASSERT(sh);
1089  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1090  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1091 
1092  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1093  UT lower = pr->u.p.ordered_lower;
1094  UT upper = pr->u.p.ordered_upper;
1095  UT inc = upper - lower + 1;
1096 
1097  if (pr->ordered_bumped == inc) {
1098  KD_TRACE(
1099  1000,
1100  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1101  gtid));
1102  pr->ordered_bumped = 0;
1103  } else {
1104  inc -= pr->ordered_bumped;
1105 
1106 #ifdef KMP_DEBUG
1107  {
1108  char *buff;
1109  // create format specifiers before the debug output
1110  buff = __kmp_str_format(
1111  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1112  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1113  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1114  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1115  __kmp_str_free(&buff);
1116  }
1117 #endif
1118 
1119  __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1120  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1121 
1122  KMP_MB(); /* is this necessary? */
1123  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1124  "ordered_bumped to zero\n",
1125  gtid));
1126  pr->ordered_bumped = 0;
1127 
1128 #ifdef KMP_DEBUG
1129  {
1130  char *buff;
1131  // create format specifiers before the debug output
1132  buff = __kmp_str_format(
1133  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1134  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1135  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1136  traits_t<UT>::spec);
1137  KD_TRACE(1000,
1138  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1139  __kmp_str_free(&buff);
1140  }
1141 #endif
1142 
1143  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1144  }
1145  // }
1146  }
1147  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1148 }
1149 
1150 #endif /* KMP_GOMP_COMPAT */
1151 
1152 template <typename T>
1153 int __kmp_dispatch_next_algorithm(int gtid,
1154  dispatch_private_info_template<T> *pr,
1155  dispatch_shared_info_template<T> volatile *sh,
1156  kmp_int32 *p_last, T *p_lb, T *p_ub,
1157  typename traits_t<T>::signed_t *p_st, T nproc,
1158  T tid) {
1159  typedef typename traits_t<T>::unsigned_t UT;
1160  typedef typename traits_t<T>::signed_t ST;
1161  typedef typename traits_t<T>::floating_t DBL;
1162  int status = 0;
1163  kmp_int32 last = 0;
1164  T start;
1165  ST incr;
1166  UT limit, trip, init;
1167  kmp_info_t *th = __kmp_threads[gtid];
1168  kmp_team_t *team = th->th.th_team;
1169 
1170  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1171  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1172  KMP_DEBUG_ASSERT(pr);
1173  KMP_DEBUG_ASSERT(sh);
1174  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1175 #ifdef KMP_DEBUG
1176  {
1177  char *buff;
1178  // create format specifiers before the debug output
1179  buff =
1180  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1181  "sh:%%p nproc:%%%s tid:%%%s\n",
1182  traits_t<T>::spec, traits_t<T>::spec);
1183  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1184  __kmp_str_free(&buff);
1185  }
1186 #endif
1187 
1188  // zero trip count
1189  if (pr->u.p.tc == 0) {
1190  KD_TRACE(10,
1191  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1192  "zero status:%d\n",
1193  gtid, status));
1194  return 0;
1195  }
1196 
1197  switch (pr->schedule) {
1198 #if (KMP_STATIC_STEAL_ENABLED)
1199  case kmp_sch_static_steal: {
1200  T chunk = pr->u.p.parm1;
1201 
1202  KD_TRACE(100,
1203  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1204  gtid));
1205 
1206  trip = pr->u.p.tc - 1;
1207 
1208  if (traits_t<T>::type_size > 4) {
1209  // use lock for 8-byte and CAS for 4-byte induction
1210  // variable. TODO (optional): check and use 16-byte CAS
1211  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1212  KMP_DEBUG_ASSERT(lck != NULL);
1213  if (pr->u.p.count < (UT)pr->u.p.ub) {
1214  __kmp_acquire_lock(lck, gtid);
1215  // try to get own chunk of iterations
1216  init = (pr->u.p.count)++;
1217  status = (init < (UT)pr->u.p.ub);
1218  __kmp_release_lock(lck, gtid);
1219  } else {
1220  status = 0; // no own chunks
1221  }
1222  if (!status) { // try to steal
1223  kmp_info_t **other_threads = team->t.t_threads;
1224  int while_limit = pr->u.p.parm3;
1225  int while_index = 0;
1226  // TODO: the algorithm for searching for a victim
1227  // should be cleaned up and measured
1228  while ((!status) && (while_limit != ++while_index)) {
1229  T remaining;
1230  T victimIdx = pr->u.p.parm4;
1231  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1232  dispatch_private_info_template<T> *victim =
1233  reinterpret_cast<dispatch_private_info_template<T> *>(
1234  other_threads[victimIdx]
1235  ->th.th_dispatch->th_dispatch_pr_current);
1236  while ((victim == NULL || victim == pr ||
1237  (*(volatile T *)&victim->u.p.static_steal_counter !=
1238  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1239  oldVictimIdx != victimIdx) {
1240  victimIdx = (victimIdx + 1) % nproc;
1241  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1242  other_threads[victimIdx]
1243  ->th.th_dispatch->th_dispatch_pr_current);
1244  }
1245  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1246  *(volatile T *)&pr->u.p.static_steal_counter)) {
1247  continue; // try once more (nproc attempts in total)
1248  // no victim is ready yet to participate in stealing
1249  // because all victims are still in kmp_init_dispatch
1250  }
1251  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1252  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1253  continue; // not enough chunks to steal, goto next victim
1254  }
1255 
1256  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1257  KMP_ASSERT(lck != NULL);
1258  __kmp_acquire_lock(lck, gtid);
1259  limit = victim->u.p.ub; // keep initial ub
1260  if (victim->u.p.count >= limit ||
1261  (remaining = limit - victim->u.p.count) < 2) {
1262  __kmp_release_lock(lck, gtid);
1263  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1264  continue; // not enough chunks to steal
1265  }
1266  // stealing succeeded, reduce victim's ub by 1/4 of the undone chunks, or
1267  // by 1
1268  if (remaining > 3) {
1269  // steal 1/4 of remaining
1270  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1271  init = (victim->u.p.ub -= (remaining >> 2));
1272  } else {
1273  // steal 1 chunk of 2 or 3 remaining
1274  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1275  init = (victim->u.p.ub -= 1);
1276  }
1277  __kmp_release_lock(lck, gtid);
1278 
1279  KMP_DEBUG_ASSERT(init + 1 <= limit);
1280  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1281  status = 1;
1282  while_index = 0;
1283  // now update own count and ub with the stolen range minus the init chunk
1284  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1285  pr->u.p.count = init + 1;
1286  pr->u.p.ub = limit;
1287  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1288  } // while (search for victim)
1289  } // if (try to find victim and steal)
1290  } else {
1291  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1292  typedef union {
1293  struct {
1294  UT count;
1295  T ub;
1296  } p;
1297  kmp_int64 b;
1298  } union_i4;
1299  // All operations on 'count' or 'ub' must be performed atomically on the
1300  // (count, ub) pair as a whole.
1301  {
1302  union_i4 vold, vnew;
1303  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1304  vnew = vold;
1305  vnew.p.count++;
1306  while (!KMP_COMPARE_AND_STORE_ACQ64(
1307  (volatile kmp_int64 *)&pr->u.p.count,
1308  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1309  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1310  KMP_CPU_PAUSE();
1311  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1312  vnew = vold;
1313  vnew.p.count++;
1314  }
1315  vnew = vold;
1316  init = vnew.p.count;
1317  status = (init < (UT)vnew.p.ub);
1318  }
1319 
1320  if (!status) {
1321  kmp_info_t **other_threads = team->t.t_threads;
1322  int while_limit = pr->u.p.parm3;
1323  int while_index = 0;
1324 
1325  // TODO: the algorithm for searching for a victim
1326  // should be cleaned up and measured
1327  while ((!status) && (while_limit != ++while_index)) {
1328  union_i4 vold, vnew;
1329  kmp_int32 remaining;
1330  T victimIdx = pr->u.p.parm4;
1331  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1332  dispatch_private_info_template<T> *victim =
1333  reinterpret_cast<dispatch_private_info_template<T> *>(
1334  other_threads[victimIdx]
1335  ->th.th_dispatch->th_dispatch_pr_current);
1336  while ((victim == NULL || victim == pr ||
1337  (*(volatile T *)&victim->u.p.static_steal_counter !=
1338  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1339  oldVictimIdx != victimIdx) {
1340  victimIdx = (victimIdx + 1) % nproc;
1341  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1342  other_threads[victimIdx]
1343  ->th.th_dispatch->th_dispatch_pr_current);
1344  }
1345  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1346  *(volatile T *)&pr->u.p.static_steal_counter)) {
1347  continue; // try once more (nproc attempts in total)
1348  // no victim is ready yet to participate in stealing
1349  // because all victims are still in kmp_init_dispatch
1350  }
1351  pr->u.p.parm4 = victimIdx; // new victim found
1352  while (1) { // CAS loop if victim has enough chunks to steal
1353  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1354  vnew = vold;
1355 
1356  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1357  if (vnew.p.count >= (UT)vnew.p.ub ||
1358  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1359  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1360  break; // not enough chunks to steal, goto next victim
1361  }
1362  if (remaining > 3) {
1363  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1364  } else {
1365  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1366  }
1367  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1368  // TODO: Should this be acquire or release?
1369  if (KMP_COMPARE_AND_STORE_ACQ64(
1370  (volatile kmp_int64 *)&victim->u.p.count,
1371  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1372  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1373  // stealing succeeded
1374  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1375  vold.p.ub - vnew.p.ub);
1376  status = 1;
1377  while_index = 0;
1378  // now update own count and ub
1379  init = vnew.p.ub;
1380  vold.p.count = init + 1;
1381 #if KMP_ARCH_X86
1382  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1383 #else
1384  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1385 #endif
1386  break;
1387  } // if (check CAS result)
1388  KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1389  } // while (try to steal from particular victim)
1390  } // while (search for victim)
1391  } // if (try to find victim and steal)
1392  } // if (4-byte induction variable)
1393  if (!status) {
1394  *p_lb = 0;
1395  *p_ub = 0;
1396  if (p_st != NULL)
1397  *p_st = 0;
1398  } else {
1399  start = pr->u.p.parm2;
1400  init *= chunk;
1401  limit = chunk + init - 1;
1402  incr = pr->u.p.st;
1403  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1404 
1405  KMP_DEBUG_ASSERT(init <= trip);
1406  if ((last = (limit >= trip)) != 0)
1407  limit = trip;
1408  if (p_st != NULL)
1409  *p_st = incr;
1410 
1411  if (incr == 1) {
1412  *p_lb = start + init;
1413  *p_ub = start + limit;
1414  } else {
1415  *p_lb = start + init * incr;
1416  *p_ub = start + limit * incr;
1417  }
1418 
1419  if (pr->flags.ordered) {
1420  pr->u.p.ordered_lower = init;
1421  pr->u.p.ordered_upper = limit;
1422  } // if
1423  } // if
1424  break;
1425  } // case
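// Editorial example (comment only) for the lock-based (>4-byte) stealing path
// above: if a victim currently has count = 2 and ub = 10 (8 unclaimed chunks),
// the thief lowers the victim's ub by remaining >> 2 = 2, so the victim keeps
// chunk indexes 2..7 and the thief takes 8..9: it returns chunk 8 from this
// call (init = 8) and records count = 9, ub = 10 as its own new range.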
1426 #endif // ( KMP_STATIC_STEAL_ENABLED )
1427  case kmp_sch_static_balanced: {
1428  KD_TRACE(
1429  10,
1430  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1431  gtid));
1432  /* check if thread has any iteration to do */
1433  if ((status = !pr->u.p.count) != 0) {
1434  pr->u.p.count = 1;
1435  *p_lb = pr->u.p.lb;
1436  *p_ub = pr->u.p.ub;
1437  last = pr->u.p.parm1;
1438  if (p_st != NULL)
1439  *p_st = pr->u.p.st;
1440  } else { /* no iterations to do */
1441  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1442  }
1443  } // case
1444  break;
1445  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1446  merged here */
1447  case kmp_sch_static_chunked: {
1448  T parm1;
1449 
1450  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1451  "kmp_sch_static_[affinity|chunked] case\n",
1452  gtid));
1453  parm1 = pr->u.p.parm1;
1454 
1455  trip = pr->u.p.tc - 1;
1456  init = parm1 * (pr->u.p.count + tid);
1457 
1458  if ((status = (init <= trip)) != 0) {
1459  start = pr->u.p.lb;
1460  incr = pr->u.p.st;
1461  limit = parm1 + init - 1;
1462 
1463  if ((last = (limit >= trip)) != 0)
1464  limit = trip;
1465 
1466  if (p_st != NULL)
1467  *p_st = incr;
1468 
1469  pr->u.p.count += nproc;
1470 
1471  if (incr == 1) {
1472  *p_lb = start + init;
1473  *p_ub = start + limit;
1474  } else {
1475  *p_lb = start + init * incr;
1476  *p_ub = start + limit * incr;
1477  }
1478 
1479  if (pr->flags.ordered) {
1480  pr->u.p.ordered_lower = init;
1481  pr->u.p.ordered_upper = limit;
1482  } // if
1483  } // if
1484  } // case
1485  break;
1486 
1487  case kmp_sch_dynamic_chunked: {
1488  T chunk = pr->u.p.parm1;
1489 
1490  KD_TRACE(
1491  100,
1492  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1493  gtid));
1494 
1495  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1496  trip = pr->u.p.tc - 1;
1497 
1498  if ((status = (init <= trip)) == 0) {
1499  *p_lb = 0;
1500  *p_ub = 0;
1501  if (p_st != NULL)
1502  *p_st = 0;
1503  } else {
1504  start = pr->u.p.lb;
1505  limit = chunk + init - 1;
1506  incr = pr->u.p.st;
1507 
1508  if ((last = (limit >= trip)) != 0)
1509  limit = trip;
1510 
1511  if (p_st != NULL)
1512  *p_st = incr;
1513 
1514  if (incr == 1) {
1515  *p_lb = start + init;
1516  *p_ub = start + limit;
1517  } else {
1518  *p_lb = start + init * incr;
1519  *p_ub = start + limit * incr;
1520  }
1521 
1522  if (pr->flags.ordered) {
1523  pr->u.p.ordered_lower = init;
1524  pr->u.p.ordered_upper = limit;
1525  } // if
1526  } // if
1527  } // case
1528  break;
1529 
1530  case kmp_sch_guided_iterative_chunked: {
1531  T chunkspec = pr->u.p.parm1;
1532  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1533  "iterative case\n",
1534  gtid));
1535  trip = pr->u.p.tc;
1536  // Start atomic part of calculations
1537  while (1) {
1538  ST remaining; // signed, because it can be < 0
1539  init = sh->u.s.iteration; // shared value
1540  remaining = trip - init;
1541  if (remaining <= 0) { // AC: need to compare with 0 first
1542  // nothing to do, don't try atomic op
1543  status = 0;
1544  break;
1545  }
1546  if ((T)remaining <
1547  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1548  // use dynamic-style schedule
1549  // atomically increment iterations, get old value
1550  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1551  (ST)chunkspec);
1552  remaining = trip - init;
1553  if (remaining <= 0) {
1554  status = 0; // all iterations got by other threads
1555  } else {
1556  // got some iterations to work on
1557  status = 1;
1558  if ((T)remaining > chunkspec) {
1559  limit = init + chunkspec - 1;
1560  } else {
1561  last = 1; // the last chunk
1562  limit = init + remaining - 1;
1563  } // if
1564  } // if
1565  break;
1566  } // if
1567  limit = init +
1568  (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1569  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1570  (ST)init, (ST)limit)) {
1571  // CAS was successful, chunk obtained
1572  status = 1;
1573  --limit;
1574  break;
1575  } // if
1576  } // while
1577  if (status != 0) {
1578  start = pr->u.p.lb;
1579  incr = pr->u.p.st;
1580  if (p_st != NULL)
1581  *p_st = incr;
1582  *p_lb = start + init * incr;
1583  *p_ub = start + limit * incr;
1584  if (pr->flags.ordered) {
1585  pr->u.p.ordered_lower = init;
1586  pr->u.p.ordered_upper = limit;
1587  } // if
1588  } else {
1589  *p_lb = 0;
1590  *p_ub = 0;
1591  if (p_st != NULL)
1592  *p_st = 0;
1593  } // if
1594  } // case
1595  break;
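// Editorial example (comment only), continuing the earlier guided setup
// (trip = 1000, nproc = 4, *(double *)&parm3 = 0.125, parm2 = 16, chunk = 1):
// successive successful CAS grabs take remaining/8 iterations, giving chunk
// sizes 125, 109, 95, ... until fewer than 16 iterations remain, at which
// point the loop above switches to plain dynamic chunks of size 1.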
1596 
1597 #if OMP_45_ENABLED
1598  case kmp_sch_guided_simd: {
1599  // same as iterative but curr-chunk adjusted to be multiple of given
1600  // chunk
1601  T chunk = pr->u.p.parm1;
1602  KD_TRACE(100,
1603  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1604  gtid));
1605  trip = pr->u.p.tc;
1606  // Start atomic part of calculations
1607  while (1) {
1608  ST remaining; // signed, because it can be < 0
1609  init = sh->u.s.iteration; // shared value
1610  remaining = trip - init;
1611  if (remaining <= 0) { // AC: need to compare with 0 first
1612  status = 0; // nothing to do, don't try atomic op
1613  break;
1614  }
1615  KMP_DEBUG_ASSERT(init % chunk == 0);
1616  // compare with K*nproc*(chunk+1), K=2 by default
1617  if ((T)remaining < pr->u.p.parm2) {
1618  // use dynamic-style schedule
1619  // atomically increment iterations, get old value
1620  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1621  (ST)chunk);
1622  remaining = trip - init;
1623  if (remaining <= 0) {
1624  status = 0; // all iterations got by other threads
1625  } else {
1626  // got some iterations to work on
1627  status = 1;
1628  if ((T)remaining > chunk) {
1629  limit = init + chunk - 1;
1630  } else {
1631  last = 1; // the last chunk
1632  limit = init + remaining - 1;
1633  } // if
1634  } // if
1635  break;
1636  } // if
1637  // divide by K*nproc
1638  UT span = remaining * (*(double *)&pr->u.p.parm3);
1639  UT rem = span % chunk;
1640  if (rem) // adjust so that span%chunk == 0
1641  span += chunk - rem;
1642  limit = init + span;
1643  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1644  (ST)init, (ST)limit)) {
1645  // CAS was successful, chunk obtained
1646  status = 1;
1647  --limit;
1648  break;
1649  } // if
1650  } // while
1651  if (status != 0) {
1652  start = pr->u.p.lb;
1653  incr = pr->u.p.st;
1654  if (p_st != NULL)
1655  *p_st = incr;
1656  *p_lb = start + init * incr;
1657  *p_ub = start + limit * incr;
1658  if (pr->flags.ordered) {
1659  pr->u.p.ordered_lower = init;
1660  pr->u.p.ordered_upper = limit;
1661  } // if
1662  } else {
1663  *p_lb = 0;
1664  *p_ub = 0;
1665  if (p_st != NULL)
1666  *p_st = 0;
1667  } // if
1668  } // case
1669  break;
1670 #endif // OMP_45_ENABLED
1671 
1672  case kmp_sch_guided_analytical_chunked: {
1673  T chunkspec = pr->u.p.parm1;
1674  UT chunkIdx;
1675 #if KMP_USE_X87CONTROL
1676  /* for storing original FPCW value for Windows* OS on
1677  IA-32 architecture 8-byte version */
1678  unsigned int oldFpcw;
1679  unsigned int fpcwSet = 0;
1680 #endif
1681  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1682  "kmp_sch_guided_analytical_chunked case\n",
1683  gtid));
1684 
1685  trip = pr->u.p.tc;
1686 
1687  KMP_DEBUG_ASSERT(nproc > 1);
1688  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1689 
1690  while (1) { /* this while loop is a safeguard against unexpected zero
1691  chunk sizes */
1692  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1693  if (chunkIdx >= (UT)pr->u.p.parm2) {
1694  --trip;
1695  /* use dynamic-style scheduling */
1696  init = chunkIdx * chunkspec + pr->u.p.count;
1697  /* need to verify init > 0 in case of overflow in the above
1698  * calculation */
1699  if ((status = (init > 0 && init <= trip)) != 0) {
1700  limit = init + chunkspec - 1;
1701 
1702  if ((last = (limit >= trip)) != 0)
1703  limit = trip;
1704  }
1705  break;
1706  } else {
1707 /* use exponential-style scheduling */
1708 /* The following check is to work around the lack of long double precision on
1709  Windows* OS.
1710  This check works around the possible effect that init != 0 for chunkIdx == 0.
1711  */
1712 #if KMP_USE_X87CONTROL
1713  /* If we haven't already done so, save original
1714  FPCW and set precision to 64-bit, as Windows* OS
1715  on IA-32 architecture defaults to 53-bit */
1716  if (!fpcwSet) {
1717  oldFpcw = _control87(0, 0);
1718  _control87(_PC_64, _MCW_PC);
1719  fpcwSet = 0x30000;
1720  }
1721 #endif
1722  if (chunkIdx) {
1723  init = __kmp_dispatch_guided_remaining<T>(
1724  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1725  KMP_DEBUG_ASSERT(init);
1726  init = trip - init;
1727  } else
1728  init = 0;
1729  limit = trip - __kmp_dispatch_guided_remaining<T>(
1730  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1731  KMP_ASSERT(init <= limit);
1732  if (init < limit) {
1733  KMP_DEBUG_ASSERT(limit <= trip);
1734  --limit;
1735  status = 1;
1736  break;
1737  } // if
1738  } // if
1739  } // while (1)
1740 #if KMP_USE_X87CONTROL
1741  /* restore FPCW if necessary
1742  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1743  */
1744  if (fpcwSet && (oldFpcw & fpcwSet))
1745  _control87(oldFpcw, _MCW_PC);
1746 #endif
1747  if (status != 0) {
1748  start = pr->u.p.lb;
1749  incr = pr->u.p.st;
1750  if (p_st != NULL)
1751  *p_st = incr;
1752  *p_lb = start + init * incr;
1753  *p_ub = start + limit * incr;
1754  if (pr->flags.ordered) {
1755  pr->u.p.ordered_lower = init;
1756  pr->u.p.ordered_upper = limit;
1757  }
1758  } else {
1759  *p_lb = 0;
1760  *p_ub = 0;
1761  if (p_st != NULL)
1762  *p_st = 0;
1763  }
1764  } // case
1765  break;
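/* Summarizing the two regimes above (descriptive note, not authoritative):
   while chunkIdx < parm2 the chunk bounds come from
   __kmp_dispatch_guided_remaining(), so successive chunks shrink
   exponentially; once chunkIdx reaches parm2 the code falls back to plain
   dynamic-style scheduling with fixed chunkspec-sized pieces. */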
1766 
1767  case kmp_sch_trapezoidal: {
1768  UT index;
1769  T parm2 = pr->u.p.parm2;
1770  T parm3 = pr->u.p.parm3;
1771  T parm4 = pr->u.p.parm4;
1772  KD_TRACE(100,
1773  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1774  gtid));
1775 
1776  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1777 
1778  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1779  trip = pr->u.p.tc - 1;
1780 
1781  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1782  *p_lb = 0;
1783  *p_ub = 0;
1784  if (p_st != NULL)
1785  *p_st = 0;
1786  } else {
1787  start = pr->u.p.lb;
1788  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1789  incr = pr->u.p.st;
1790 
1791  if ((last = (limit >= trip)) != 0)
1792  limit = trip;
1793 
1794  if (p_st != NULL)
1795  *p_st = incr;
1796 
1797  if (incr == 1) {
1798  *p_lb = start + init;
1799  *p_ub = start + limit;
1800  } else {
1801  *p_lb = start + init * incr;
1802  *p_ub = start + limit * incr;
1803  }
1804 
1805  if (pr->flags.ordered) {
1806  pr->u.p.ordered_lower = init;
1807  pr->u.p.ordered_upper = limit;
1808  } // if
1809  } // if
1810  } // case
1811  break;
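/* A worked example of the trapezoid formulas above, with illustrative
   numbers: for parm2 = 10 (first chunk size) and parm4 = 2 (per-chunk
   decrement), index 0 gives init = 0 and limit = (1 * 20) / 2 - 1 = 9
   (a chunk of 10); index 1 gives init = (1 * (20 - 0 * 2)) / 2 = 10 and
   limit = (2 * (20 - 2)) / 2 - 1 = 17 (a chunk of 8). Chunk sizes shrink
   by parm4 each time until parm3 chunks have been handed out. */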
1812  default: {
1813  status = 0; // to avoid complaints on uninitialized variable use
1814  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1815  KMP_HNT(GetNewerLibrary), // Hint
1816  __kmp_msg_null // Variadic argument list terminator
1817  );
1818  } break;
1819  } // switch
1820  if (p_last)
1821  *p_last = last;
1822 #ifdef KMP_DEBUG
1823  if (pr->flags.ordered) {
1824  char *buff;
1825  // create format specifiers before the debug output
1826  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1827  "ordered_lower:%%%s ordered_upper:%%%s\n",
1828  traits_t<UT>::spec, traits_t<UT>::spec);
1829  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1830  __kmp_str_free(&buff);
1831  }
1832  {
1833  char *buff;
1834  // create format specifiers before the debug output
1835  buff = __kmp_str_format(
1836  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1837  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1838  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1839  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1840  __kmp_str_free(&buff);
1841  }
1842 #endif
1843  return status;
1844 }
1845 
1846 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1847  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1848  is not called. */
1849 #if OMPT_SUPPORT && OMPT_OPTIONAL
1850 #define OMPT_LOOP_END \
1851  if (status == 0) { \
1852  if (ompt_enabled.ompt_callback_work) { \
1853  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1854  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1855  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1856  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1857  &(task_info->task_data), 0, codeptr); \
1858  } \
1859  }
1860 // TODO: implement count
1861 #else
1862 #define OMPT_LOOP_END // no-op
1863 #endif
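/* A minimal sketch of the tool-side callback that OMPT_LOOP_END invokes at
   loop end. The function name and body are hypothetical; only the argument
   list mirrors the six arguments passed to ompt_callback_work above. */
#if 0
static void example_work_callback(ompt_work_t wstype,
                                  ompt_scope_endpoint_t endpoint,
                                  ompt_data_t *parallel_data,
                                  ompt_data_t *task_data, uint64_t count,
                                  const void *codeptr_ra) {
  // A registered tool sees ompt_work_loop / ompt_scope_end here when a
  // dynamically scheduled loop runs out of iterations (count is passed as 0,
  // see the TODO above).
  if (wstype == ompt_work_loop && endpoint == ompt_scope_end) {
    /* record the loop-end event */
  }
}
#endif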
1864 
1865 #if KMP_STATS_ENABLED
1866 #define KMP_STATS_LOOP_END \
1867  { \
1868  kmp_int64 u, l, t, i; \
1869  l = (kmp_int64)(*p_lb); \
1870  u = (kmp_int64)(*p_ub); \
1871  i = (kmp_int64)(pr->u.p.st); \
1872  if (status == 0) { \
1873  t = 0; \
1874  KMP_POP_PARTITIONED_TIMER(); \
1875  } else if (i == 1) { \
1876  if (u >= l) \
1877  t = u - l + 1; \
1878  else \
1879  t = 0; \
1880  } else if (i < 0) { \
1881  if (l >= u) \
1882  t = (l - u) / (-i) + 1; \
1883  else \
1884  t = 0; \
1885  } else { \
1886  if (u >= l) \
1887  t = (u - l) / i + 1; \
1888  else \
1889  t = 0; \
1890  } \
1891  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1892  }
1893 #else
1894 #define KMP_STATS_LOOP_END /* Nothing */
1895 #endif
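/* The trip-count arithmetic in KMP_STATS_LOOP_END, worked through on
   illustrative values: for l = 0, u = 9, i = 2 the chunk covers iterations
   0, 2, 4, 6, 8, so t = (u - l) / i + 1 = 5; for i = -2 with l = 9, u = 0
   the symmetric branch gives t = (l - u) / (-i) + 1 = 5 as well; and when
   status == 0 no iterations were handed out, so t = 0. */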
1896 
1897 template <typename T>
1898 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1899  T *p_lb, T *p_ub,
1900  typename traits_t<T>::signed_t *p_st
1901 #if OMPT_SUPPORT && OMPT_OPTIONAL
1902  ,
1903  void *codeptr
1904 #endif
1905  ) {
1906 
1907  typedef typename traits_t<T>::unsigned_t UT;
1908  typedef typename traits_t<T>::signed_t ST;
1909  // This is potentially slightly misleading: schedule(runtime) will appear
1910  // here even if the actual runtime schedule is static. (Which points out a
1911  // disadvantage of schedule(runtime): even when static scheduling is used, it
1912  // costs more than a compile-time choice of static scheduling would.)
1913  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1914 
1915  int status;
1916  dispatch_private_info_template<T> *pr;
1917  kmp_info_t *th = __kmp_threads[gtid];
1918  kmp_team_t *team = th->th.th_team;
1919 
1920  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1921  KD_TRACE(
1922  1000,
1923  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1924  gtid, p_lb, p_ub, p_st, p_last));
1925 
1926  if (team->t.t_serialized) {
1927  /* NOTE: serialize this dispatch because we are not at the active level */
1928  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1929  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1930  KMP_DEBUG_ASSERT(pr);
1931 
1932  if ((status = (pr->u.p.tc != 0)) == 0) {
1933  *p_lb = 0;
1934  *p_ub = 0;
1935  // if ( p_last != NULL )
1936  // *p_last = 0;
1937  if (p_st != NULL)
1938  *p_st = 0;
1939  if (__kmp_env_consistency_check) {
1940  if (pr->pushed_ws != ct_none) {
1941  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1942  }
1943  }
1944  } else if (pr->flags.nomerge) {
1945  kmp_int32 last;
1946  T start;
1947  UT limit, trip, init;
1948  ST incr;
1949  T chunk = pr->u.p.parm1;
1950 
1951  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1952  gtid));
1953 
1954  init = chunk * pr->u.p.count++;
1955  trip = pr->u.p.tc - 1;
1956 
1957  if ((status = (init <= trip)) == 0) {
1958  *p_lb = 0;
1959  *p_ub = 0;
1960  // if ( p_last != NULL )
1961  // *p_last = 0;
1962  if (p_st != NULL)
1963  *p_st = 0;
1964  if (__kmp_env_consistency_check) {
1965  if (pr->pushed_ws != ct_none) {
1966  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1967  }
1968  }
1969  } else {
1970  start = pr->u.p.lb;
1971  limit = chunk + init - 1;
1972  incr = pr->u.p.st;
1973 
1974  if ((last = (limit >= trip)) != 0) {
1975  limit = trip;
1976 #if KMP_OS_WINDOWS
1977  pr->u.p.last_upper = pr->u.p.ub;
1978 #endif /* KMP_OS_WINDOWS */
1979  }
1980  if (p_last != NULL)
1981  *p_last = last;
1982  if (p_st != NULL)
1983  *p_st = incr;
1984  if (incr == 1) {
1985  *p_lb = start + init;
1986  *p_ub = start + limit;
1987  } else {
1988  *p_lb = start + init * incr;
1989  *p_ub = start + limit * incr;
1990  }
1991 
1992  if (pr->flags.ordered) {
1993  pr->u.p.ordered_lower = init;
1994  pr->u.p.ordered_upper = limit;
1995 #ifdef KMP_DEBUG
1996  {
1997  char *buff;
1998  // create format specifiers before the debug output
1999  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2000  "ordered_lower:%%%s ordered_upper:%%%s\n",
2001  traits_t<UT>::spec, traits_t<UT>::spec);
2002  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2003  pr->u.p.ordered_upper));
2004  __kmp_str_free(&buff);
2005  }
2006 #endif
2007  } // if
2008  } // if
2009  } else {
2010  pr->u.p.tc = 0;
2011  *p_lb = pr->u.p.lb;
2012  *p_ub = pr->u.p.ub;
2013 #if KMP_OS_WINDOWS
2014  pr->u.p.last_upper = *p_ub;
2015 #endif /* KMP_OS_WINDOWS */
2016  if (p_last != NULL)
2017  *p_last = TRUE;
2018  if (p_st != NULL)
2019  *p_st = pr->u.p.st;
2020  } // if
2021 #ifdef KMP_DEBUG
2022  {
2023  char *buff;
2024  // create format specifiers before the debug output
2025  buff = __kmp_str_format(
2026  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2027  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2028  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2029  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2030  __kmp_str_free(&buff);
2031  }
2032 #endif
2033 #if INCLUDE_SSC_MARKS
2034  SSC_MARK_DISPATCH_NEXT();
2035 #endif
2036  OMPT_LOOP_END;
2037  KMP_STATS_LOOP_END;
2038  return status;
2039  } else {
2040  kmp_int32 last = 0;
2041  dispatch_shared_info_template<T> volatile *sh;
2042 
2043  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2044  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2045 
2046  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2047  th->th.th_dispatch->th_dispatch_pr_current);
2048  KMP_DEBUG_ASSERT(pr);
2049  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2050  th->th.th_dispatch->th_dispatch_sh_current);
2051  KMP_DEBUG_ASSERT(sh);
2052 
2053 #if KMP_USE_HIER_SCHED
2054  if (pr->flags.use_hier)
2055  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2056  else
2057 #endif // KMP_USE_HIER_SCHED
2058  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2059  p_st, th->th.th_team_nproc,
2060  th->th.th_info.ds.ds_tid);
2061  // status == 0: no more iterations to execute
2062  if (status == 0) {
2063  UT num_done;
2064 
2065  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2066 #ifdef KMP_DEBUG
2067  {
2068  char *buff;
2069  // create format specifiers before the debug output
2070  buff = __kmp_str_format(
2071  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2072  traits_t<UT>::spec);
2073  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2074  __kmp_str_free(&buff);
2075  }
2076 #endif
2077 
2078 #if KMP_USE_HIER_SCHED
2079  pr->flags.use_hier = FALSE;
2080 #endif
2081  if ((ST)num_done == th->th.th_team_nproc - 1) {
2082 #if (KMP_STATIC_STEAL_ENABLED)
2083  if (pr->schedule == kmp_sch_static_steal &&
2084  traits_t<T>::type_size > 4) {
2085  int i;
2086  kmp_info_t **other_threads = team->t.t_threads;
2087  // loop complete, safe to destroy locks used for stealing
2088  for (i = 0; i < th->th.th_team_nproc; ++i) {
2089  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2090  KMP_ASSERT(lck != NULL);
2091  __kmp_destroy_lock(lck);
2092  __kmp_free(lck);
2093  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2094  }
2095  }
2096 #endif
2097  /* NOTE: release this buffer to be reused */
2098 
2099  KMP_MB(); /* Flush all pending memory write invalidates. */
2100 
2101  sh->u.s.num_done = 0;
2102  sh->u.s.iteration = 0;
2103 
2104  /* TODO replace with general release procedure? */
2105  if (pr->flags.ordered) {
2106  sh->u.s.ordered_iteration = 0;
2107  }
2108 
2109  KMP_MB(); /* Flush all pending memory write invalidates. */
2110 
2111  sh->buffer_index += __kmp_dispatch_num_buffers;
2112  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2113  gtid, sh->buffer_index));
2114 
2115  KMP_MB(); /* Flush all pending memory write invalidates. */
2116 
2117  } // if
2118  if (__kmp_env_consistency_check) {
2119  if (pr->pushed_ws != ct_none) {
2120  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2121  }
2122  }
2123 
2124  th->th.th_dispatch->th_deo_fcn = NULL;
2125  th->th.th_dispatch->th_dxo_fcn = NULL;
2126  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2127  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2128  } // if (status == 0)
2129 #if KMP_OS_WINDOWS
2130  else if (last) {
2131  pr->u.p.last_upper = pr->u.p.ub;
2132  }
2133 #endif /* KMP_OS_WINDOWS */
2134  if (p_last != NULL && status != 0)
2135  *p_last = last;
2136  } // if
2137 
2138 #ifdef KMP_DEBUG
2139  {
2140  char *buff;
2141  // create format specifiers before the debug output
2142  buff = __kmp_str_format(
2143  "__kmp_dispatch_next: T#%%d normal case: "
2144  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2145  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2146  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2147  (p_last ? *p_last : 0), status));
2148  __kmp_str_free(&buff);
2149  }
2150 #endif
2151 #if INCLUDE_SSC_MARKS
2152  SSC_MARK_DISPATCH_NEXT();
2153 #endif
2154  OMPT_LOOP_END;
2155  KMP_STATS_LOOP_END;
2156  return status;
2157 }
2158 
2159 template <typename T>
2160 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2161  kmp_int32 *plastiter, T *plower, T *pupper,
2162  typename traits_t<T>::signed_t incr) {
2163  typedef typename traits_t<T>::unsigned_t UT;
2164  kmp_uint32 team_id;
2165  kmp_uint32 nteams;
2166  UT trip_count;
2167  kmp_team_t *team;
2168  kmp_info_t *th;
2169 
2170  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2171  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2172 #ifdef KMP_DEBUG
2173  typedef typename traits_t<T>::signed_t ST;
2174  {
2175  char *buff;
2176  // create format specifiers before the debug output
2177  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2178  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2179  traits_t<T>::spec, traits_t<T>::spec,
2180  traits_t<ST>::spec, traits_t<T>::spec);
2181  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2182  __kmp_str_free(&buff);
2183  }
2184 #endif
2185 
2186  if (__kmp_env_consistency_check) {
2187  if (incr == 0) {
2188  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2189  loc);
2190  }
2191  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2192  // The loop is illegal.
2193  // Some zero-trip loops maintained by compiler, e.g.:
2194  // for(i=10;i<0;++i) // lower >= upper - run-time check
2195  // for(i=0;i>10;--i) // lower <= upper - run-time check
2196  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2197  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2198  // Compiler does not check the following illegal loops:
2199  // for(i=0;i<10;i+=incr) // where incr<0
2200  // for(i=10;i>0;i-=incr) // where incr<0
2201  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2202  }
2203  }
2204  th = __kmp_threads[gtid];
2205  team = th->th.th_team;
2206 #if OMP_40_ENABLED
2207  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2208  nteams = th->th.th_teams_size.nteams;
2209 #endif
2210  team_id = team->t.t_master_tid;
2211  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2212 
2213  // compute global trip count
2214  if (incr == 1) {
2215  trip_count = *pupper - *plower + 1;
2216  } else if (incr == -1) {
2217  trip_count = *plower - *pupper + 1;
2218  } else if (incr > 0) {
2219  // upper-lower can exceed the limit of signed type
2220  trip_count = (UT)(*pupper - *plower) / incr + 1;
2221  } else {
2222  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2223  }
2224 
2225  if (trip_count <= nteams) {
2226  KMP_DEBUG_ASSERT(
2227  __kmp_static == kmp_sch_static_greedy ||
2228  __kmp_static ==
2229  kmp_sch_static_balanced); // Unknown static scheduling type.
2230  // only some teams get a single iteration, the others get nothing
2231  if (team_id < trip_count) {
2232  *pupper = *plower = *plower + team_id * incr;
2233  } else {
2234  *plower = *pupper + incr; // zero-trip loop
2235  }
2236  if (plastiter != NULL)
2237  *plastiter = (team_id == trip_count - 1);
2238  } else {
2239  if (__kmp_static == kmp_sch_static_balanced) {
2240  UT chunk = trip_count / nteams;
2241  UT extras = trip_count % nteams;
2242  *plower +=
2243  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2244  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2245  if (plastiter != NULL)
2246  *plastiter = (team_id == nteams - 1);
2247  } else {
2248  T chunk_inc_count =
2249  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2250  T upper = *pupper;
2251  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2252  // Unknown static scheduling type.
2253  *plower += team_id * chunk_inc_count;
2254  *pupper = *plower + chunk_inc_count - incr;
2255  // Check/correct bounds if needed
2256  if (incr > 0) {
2257  if (*pupper < *plower)
2258  *pupper = traits_t<T>::max_value;
2259  if (plastiter != NULL)
2260  *plastiter = *plower <= upper && *pupper > upper - incr;
2261  if (*pupper > upper)
2262  *pupper = upper; // tracker C73258
2263  } else {
2264  if (*pupper > *plower)
2265  *pupper = traits_t<T>::min_value;
2266  if (plastiter != NULL)
2267  *plastiter = *plower >= upper && *pupper < upper - incr;
2268  if (*pupper < upper)
2269  *pupper = upper; // tracker C73258
2270  }
2271  }
2272  }
2273 }
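/* A worked example of the balanced distribution above, with illustrative
   numbers: for lower = 0, upper = 102, incr = 1 and nteams = 4, trip_count =
   103, chunk = 25 and extras = 3, so teams 0..2 each get 26 iterations
   ([0,25], [26,51], [52,77]) while team 3 gets the remaining 25 ([78,102]);
   plastiter is set only for team_id == nteams - 1. */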
2274 
2275 //-----------------------------------------------------------------------------
2276 // Dispatch routines
2277 // Transfer call to template< type T >
2278 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2279 // T lb, T ub, ST st, ST chunk )
2280 extern "C" {
2281 
2298 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2299  enum sched_type schedule, kmp_int32 lb,
2300  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2301  KMP_DEBUG_ASSERT(__kmp_init_serial);
2302 #if OMPT_SUPPORT && OMPT_OPTIONAL
2303  OMPT_STORE_RETURN_ADDRESS(gtid);
2304 #endif
2305  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2306 }
2310 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2311  enum sched_type schedule, kmp_uint32 lb,
2312  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2313  KMP_DEBUG_ASSERT(__kmp_init_serial);
2314 #if OMPT_SUPPORT && OMPT_OPTIONAL
2315  OMPT_STORE_RETURN_ADDRESS(gtid);
2316 #endif
2317  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2318 }
2319 
2323 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2324  enum sched_type schedule, kmp_int64 lb,
2325  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2326  KMP_DEBUG_ASSERT(__kmp_init_serial);
2327 #if OMPT_SUPPORT && OMPT_OPTIONAL
2328  OMPT_STORE_RETURN_ADDRESS(gtid);
2329 #endif
2330  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2331 }
2332 
2336 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2337  enum sched_type schedule, kmp_uint64 lb,
2338  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2339  KMP_DEBUG_ASSERT(__kmp_init_serial);
2340 #if OMPT_SUPPORT && OMPT_OPTIONAL
2341  OMPT_STORE_RETURN_ADDRESS(gtid);
2342 #endif
2343  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2344 }
2345 
2355 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2356  enum sched_type schedule, kmp_int32 *p_last,
2357  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2358  kmp_int32 chunk) {
2359  KMP_DEBUG_ASSERT(__kmp_init_serial);
2360 #if OMPT_SUPPORT && OMPT_OPTIONAL
2361  OMPT_STORE_RETURN_ADDRESS(gtid);
2362 #endif
2363  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2364  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2365 }
2366 
2367 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2368  enum sched_type schedule, kmp_int32 *p_last,
2369  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2370  kmp_int32 chunk) {
2371  KMP_DEBUG_ASSERT(__kmp_init_serial);
2372 #if OMPT_SUPPORT && OMPT_OPTIONAL
2373  OMPT_STORE_RETURN_ADDRESS(gtid);
2374 #endif
2375  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2376  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2377 }
2378 
2379 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2380  enum sched_type schedule, kmp_int32 *p_last,
2381  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2382  kmp_int64 chunk) {
2383  KMP_DEBUG_ASSERT(__kmp_init_serial);
2384 #if OMPT_SUPPORT && OMPT_OPTIONAL
2385  OMPT_STORE_RETURN_ADDRESS(gtid);
2386 #endif
2387  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2388  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2389 }
2390 
2391 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2392  enum sched_type schedule, kmp_int32 *p_last,
2393  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2394  kmp_int64 chunk) {
2395  KMP_DEBUG_ASSERT(__kmp_init_serial);
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397  OMPT_STORE_RETURN_ADDRESS(gtid);
2398 #endif
2399  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2400  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2401 }
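/* Note: each __kmpc_dist_dispatch_init_* entry above first narrows [lb, ub]
   to the calling team's slice via __kmp_dist_get_bounds and then starts
   ordinary dynamic dispatch on that slice; this is the shape a compiler
   typically emits for a distribute parallel for loop with a dispatch
   schedule inside a teams construct. */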
2402 
2416 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2417  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
2419  OMPT_STORE_RETURN_ADDRESS(gtid);
2420 #endif
2421  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2422 #if OMPT_SUPPORT && OMPT_OPTIONAL
2423  ,
2424  OMPT_LOAD_RETURN_ADDRESS(gtid)
2425 #endif
2426  );
2427 }
2428 
2432 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2433  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2434  kmp_int32 *p_st) {
2435 #if OMPT_SUPPORT && OMPT_OPTIONAL
2436  OMPT_STORE_RETURN_ADDRESS(gtid);
2437 #endif
2438  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440  ,
2441  OMPT_LOAD_RETURN_ADDRESS(gtid)
2442 #endif
2443  );
2444 }
2445 
2449 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2450  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2451 #if OMPT_SUPPORT && OMPT_OPTIONAL
2452  OMPT_STORE_RETURN_ADDRESS(gtid);
2453 #endif
2454  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2455 #if OMPT_SUPPORT && OMPT_OPTIONAL
2456  ,
2457  OMPT_LOAD_RETURN_ADDRESS(gtid)
2458 #endif
2459  );
2460 }
2461 
2465 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2466  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2467  kmp_int64 *p_st) {
2468 #if OMPT_SUPPORT && OMPT_OPTIONAL
2469  OMPT_STORE_RETURN_ADDRESS(gtid);
2470 #endif
2471  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2472 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473  ,
2474  OMPT_LOAD_RETURN_ADDRESS(gtid)
2475 #endif
2476  );
2477 }
2478 
2485 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2486  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2487 }
2488 
2492 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2493  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2494 }
2495 
2499 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2500  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2501 }
2502 
2506 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2507  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2508 }
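/* A minimal usage sketch (not part of the runtime) of the dispatch entry
   points above, for a dynamic loop over i = 0..N-1 with chunk 4. The names
   N, body(), loc and gtid are assumed to be provided by the caller. */
#if 0
void example_dynamic_loop(ident_t *loc, kmp_int32 gtid, kmp_int32 N) {
  kmp_int32 lb, ub, st, last;
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4);
  // __kmpc_dispatch_next_4 returns nonzero while there is another chunk to
  // execute and fills in its bounds; it returns 0 once the loop is finished.
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st)
      body(i);
  }
}
#endif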
2511 //-----------------------------------------------------------------------------
2512 // Non-template routines from kmp_dispatch.cpp used in other sources
2513 
2514 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2515  return value == checker;
2516 }
2517 
2518 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2519  return value != checker;
2520 }
2521 
2522 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2523  return value < checker;
2524 }
2525 
2526 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2527  return value >= checker;
2528 }
2529 
2530 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2531  return value <= checker;
2532 }
2533 
2534 kmp_uint32
2535 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2536  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2537  void *obj // Higher-level synchronization object, or NULL.
2538  ) {
2539  // note: we may not belong to a team at this point
2540  volatile kmp_uint32 *spin = spinner;
2541  kmp_uint32 check = checker;
2542  kmp_uint32 spins;
2543  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2544  kmp_uint32 r;
2545 
2546  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2547  KMP_INIT_YIELD(spins);
2548  // main wait spin loop
2549  while (!f(r = TCR_4(*spin), check)) {
2550  KMP_FSYNC_SPIN_PREPARE(obj);
2551  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2552  split. It causes problems with infinite recursion because of exit lock */
2553  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2554  __kmp_abort_thread(); */
2555  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2556  }
2557  KMP_FSYNC_SPIN_ACQUIRED(obj);
2558  return r;
2559 }
2560 
2561 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2562  kmp_uint32 (*pred)(void *, kmp_uint32),
2563  void *obj // Higher-level synchronization object, or NULL.
2564  ) {
2565  // note: we may not belong to a team at this point
2566  void *spin = spinner;
2567  kmp_uint32 check = checker;
2568  kmp_uint32 spins;
2569  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2570 
2571  KMP_FSYNC_SPIN_INIT(obj, spin);
2572  KMP_INIT_YIELD(spins);
2573  // main wait spin loop
2574  while (!f(spin, check)) {
2575  KMP_FSYNC_SPIN_PREPARE(obj);
2576  /* if we have waited a bit, or are oversubscribed, yield */
2577  /* pause is in the following code */
2578  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2579  }
2580  KMP_FSYNC_SPIN_ACQUIRED(obj);
2581 }
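/* A minimal sketch (illustrative only) of how __kmp_wait_4 combines with the
   predicate helpers above: spin until a shared counter reaches an expected
   value, yielding when oversubscribed. The function and variable names here
   are hypothetical. */
#if 0
static void example_wait_for_count(volatile kmp_uint32 *counter,
                                   kmp_uint32 expected) {
  // Blocks (spinning or yielding) until *counter >= expected, using
  // __kmp_ge_4 as the completion predicate; NULL means there is no
  // higher-level synchronization object.
  __kmp_wait_4(counter, expected, __kmp_ge_4, NULL);
}
#endif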
2582 
2583 } // extern "C"
2584 
2585 #ifdef KMP_GOMP_COMPAT
2586 
2587 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2588  enum sched_type schedule, kmp_int32 lb,
2589  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2590  int push_ws) {
2591  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2592  push_ws);
2593 }
2594 
2595 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2596  enum sched_type schedule, kmp_uint32 lb,
2597  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2598  int push_ws) {
2599  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2600  push_ws);
2601 }
2602 
2603 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2604  enum sched_type schedule, kmp_int64 lb,
2605  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2606  int push_ws) {
2607  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2608  push_ws);
2609 }
2610 
2611 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2612  enum sched_type schedule, kmp_uint64 lb,
2613  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2614  int push_ws) {
2615  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2616  push_ws);
2617 }
2618 
2619 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2620  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2621 }
2622 
2623 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2624  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2625 }
2626 
2627 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2628  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2629 }
2630 
2631 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2632  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2633 }
2634 
2635 #endif /* KMP_GOMP_COMPAT */
2636 
2637 /* ------------------------------------------------------------------------ */