File: | build/source/openmp/runtime/src/kmp_dispatch.cpp |
Warning: | line 1946, column 5 Dereference of null pointer (loaded from variable 'p_last') |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* | |||
2 | * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. | |||
3 | */ | |||
4 | ||||
5 | //===----------------------------------------------------------------------===// | |||
6 | // | |||
7 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
8 | // See https://llvm.org/LICENSE.txt for license information. | |||
9 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
10 | // | |||
11 | //===----------------------------------------------------------------------===// | |||
12 | ||||
13 | /* Dynamic scheduling initialization and dispatch. | |||
14 | * | |||
15 | * NOTE: __kmp_nth is a constant inside of any dispatch loop, however | |||
16 | * it may change values between parallel regions. __kmp_max_nth | |||
17 | * is the largest value __kmp_nth may take, 1 is the smallest. | |||
18 | */ | |||
19 | ||||
20 | #include "kmp.h" | |||
21 | #include "kmp_error.h" | |||
22 | #include "kmp_i18n.h" | |||
23 | #include "kmp_itt.h" | |||
24 | #include "kmp_stats.h" | |||
25 | #include "kmp_str.h" | |||
26 | #if KMP_USE_X87CONTROL0 | |||
27 | #include <float.h> | |||
28 | #endif | |||
29 | #include "kmp_lock.h" | |||
30 | #include "kmp_dispatch.h" | |||
31 | #if KMP_USE_HIER_SCHED0 | |||
32 | #include "kmp_dispatch_hier.h" | |||
33 | #endif | |||
34 | ||||
35 | #if OMPT_SUPPORT1 | |||
36 | #include "ompt-specific.h" | |||
37 | #endif | |||
38 | ||||
39 | /* ------------------------------------------------------------------------ */ | |||
40 | /* ------------------------------------------------------------------------ */ | |||
41 | ||||
42 | void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { | |||
43 | kmp_info_t *th; | |||
44 | ||||
45 | KMP_DEBUG_ASSERT(gtid_ref)if (!(gtid_ref)) { __kmp_debug_assert("gtid_ref", "openmp/runtime/src/kmp_dispatch.cpp" , 45); }; | |||
46 | ||||
47 | if (__kmp_env_consistency_check) { | |||
48 | th = __kmp_threads[*gtid_ref]; | |||
49 | if (th->th.th_root->r.r_active && | |||
50 | (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) { | |||
51 | #if KMP_USE_DYNAMIC_LOCK1 | |||
52 | __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL__null, 0); | |||
53 | #else | |||
54 | __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL__null); | |||
55 | #endif | |||
56 | } | |||
57 | } | |||
58 | } | |||
59 | ||||
60 | void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { | |||
61 | kmp_info_t *th; | |||
62 | ||||
63 | if (__kmp_env_consistency_check) { | |||
64 | th = __kmp_threads[*gtid_ref]; | |||
65 | if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) { | |||
66 | __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref); | |||
67 | } | |||
68 | } | |||
69 | } | |||
70 | ||||
71 | // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC | |||
72 | static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, | |||
73 | bool use_hier = false) { | |||
74 | // Pick up the nonmonotonic/monotonic bits from the scheduling type | |||
75 | // Nonmonotonic as default for dynamic schedule when no modifier is specified | |||
76 | int monotonicity = SCHEDULE_NONMONOTONIC0; | |||
77 | ||||
78 | // Let default be monotonic for executables | |||
79 | // compiled with OpenMP* 4.5 or less compilers | |||
80 | if (loc != NULL__null && loc->get_openmp_version() < 50) | |||
81 | monotonicity = SCHEDULE_MONOTONIC1; | |||
82 | ||||
83 | if (use_hier || __kmp_force_monotonic) | |||
84 | monotonicity = SCHEDULE_MONOTONIC1; | |||
85 | else if (SCHEDULE_HAS_NONMONOTONIC(schedule)(((schedule)&kmp_sch_modifier_nonmonotonic) != 0)) | |||
86 | monotonicity = SCHEDULE_NONMONOTONIC0; | |||
87 | else if (SCHEDULE_HAS_MONOTONIC(schedule)(((schedule)&kmp_sch_modifier_monotonic) != 0)) | |||
88 | monotonicity = SCHEDULE_MONOTONIC1; | |||
89 | ||||
90 | return monotonicity; | |||
91 | } | |||
92 | ||||
#if KMP_STATIC_STEAL_ENABLED
// Values for steal_flag: possible states of the private per-loop buffer
// used by the static-steal schedule.
enum {
  UNUSED = 0, // buffer not yet touched for this loop instance
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // initialized; available for stealing by other threads
  THIEF = 3 // finished by owner, or claimed by a thief
  // Possible state changes (sync = via atomic CAS, async = plain store):
  // 0 -> 1 owner only, sync
  // 0 -> 3 thief only, sync
  // 1 -> 2 owner only, async
  // 2 -> 3 owner only, async
  // 3 -> 2 owner only, async
  // 3 -> 0 last thread finishing the loop, async
};
#endif
108 | ||||
109 | // Initialize a dispatch_private_info_template<T> buffer for a particular | |||
110 | // type of schedule,chunk. The loop description is found in lb (lower bound), | |||
111 | // ub (upper bound), and st (stride). nproc is the number of threads relevant | |||
112 | // to the scheduling (often the number of threads in a team, but not always if | |||
113 | // hierarchical scheduling is used). tid is the id of the thread calling | |||
114 | // the function within the group of nproc threads. It will have a value | |||
115 | // between 0 and nproc - 1. This is often just the thread id within a team, but | |||
116 | // is not necessarily the case when using hierarchical scheduling. | |||
117 | // loc is the source file location of the corresponding loop | |||
118 | // gtid is the global thread id | |||
119 | template <typename T> | |||
120 | void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, | |||
121 | dispatch_private_info_template<T> *pr, | |||
122 | enum sched_type schedule, T lb, T ub, | |||
123 | typename traits_t<T>::signed_t st, | |||
124 | #if USE_ITT_BUILD1 | |||
125 | kmp_uint64 *cur_chunk, | |||
126 | #endif | |||
127 | typename traits_t<T>::signed_t chunk, | |||
128 | T nproc, T tid) { | |||
129 | typedef typename traits_t<T>::unsigned_t UT; | |||
130 | typedef typename traits_t<T>::floating_t DBL; | |||
131 | ||||
132 | int active; | |||
133 | T tc; | |||
134 | kmp_info_t *th; | |||
135 | kmp_team_t *team; | |||
136 | int monotonicity; | |||
137 | bool use_hier; | |||
138 | ||||
139 | #ifdef KMP_DEBUG1 | |||
140 | typedef typename traits_t<T>::signed_t ST; | |||
141 | { | |||
142 | char *buff; | |||
143 | // create format specifiers before the debug output | |||
144 | buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called " | |||
145 | "pr:%%p lb:%%%s ub:%%%s st:%%%s " | |||
146 | "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n", | |||
147 | traits_t<T>::spec, traits_t<T>::spec, | |||
148 | traits_t<ST>::spec, traits_t<ST>::spec, | |||
149 | traits_t<T>::spec, traits_t<T>::spec); | |||
150 | KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, pr , lb, ub, st, schedule, chunk, nproc, tid); }; | |||
151 | __kmp_str_free(&buff); | |||
152 | } | |||
153 | #endif | |||
154 | /* setup data */ | |||
155 | th = __kmp_threads[gtid]; | |||
156 | team = th->th.th_team; | |||
157 | active = !team->t.t_serialized; | |||
158 | ||||
159 | #if USE_ITT_BUILD1 | |||
160 | int itt_need_metadata_reporting = | |||
161 | __itt_metadata_add_ptr__kmp_itt_metadata_add_ptr__3_0 && __kmp_forkjoin_frames_mode == 3 && | |||
162 | KMP_MASTER_GTID(gtid)(0 == __kmp_tid_from_gtid((gtid))) && th->th.th_teams_microtask == NULL__null && | |||
163 | team->t.t_active_level == 1; | |||
164 | #endif | |||
165 | ||||
166 | #if KMP_USE_HIER_SCHED0 | |||
167 | use_hier = pr->flags.use_hier; | |||
168 | #else | |||
169 | use_hier = false; | |||
170 | #endif | |||
171 | ||||
172 | /* Pick up the nonmonotonic/monotonic bits from the scheduling type */ | |||
173 | monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); | |||
174 | schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule)(enum sched_type)( (schedule) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)); | |||
175 | ||||
176 | /* Pick up the nomerge/ordered bits from the scheduling type */ | |||
177 | if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) { | |||
178 | pr->flags.nomerge = TRUE(!0); | |||
179 | schedule = | |||
180 | (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower)); | |||
181 | } else { | |||
182 | pr->flags.nomerge = FALSE0; | |||
183 | } | |||
184 | pr->type_size = traits_t<T>::type_size; // remember the size of variables | |||
185 | if (kmp_ord_lower & schedule) { | |||
186 | pr->flags.ordered = TRUE(!0); | |||
187 | schedule = | |||
188 | (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower)); | |||
189 | } else { | |||
190 | pr->flags.ordered = FALSE0; | |||
191 | } | |||
192 | // Ordered overrides nonmonotonic | |||
193 | if (pr->flags.ordered) { | |||
194 | monotonicity = SCHEDULE_MONOTONIC1; | |||
195 | } | |||
196 | ||||
197 | if (schedule == kmp_sch_static) { | |||
198 | schedule = __kmp_static; | |||
199 | } else { | |||
200 | if (schedule == kmp_sch_runtime) { | |||
201 | // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if | |||
202 | // not specified) | |||
203 | schedule = team->t.t_sched.r_sched_type; | |||
204 | monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); | |||
205 | schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule)(enum sched_type)( (schedule) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)); | |||
206 | if (pr->flags.ordered) // correct monotonicity for ordered loop if needed | |||
207 | monotonicity = SCHEDULE_MONOTONIC1; | |||
208 | // Detail the schedule if needed (global controls are differentiated | |||
209 | // appropriately) | |||
210 | if (schedule == kmp_sch_guided_chunked) { | |||
211 | schedule = __kmp_guided; | |||
212 | } else if (schedule == kmp_sch_static) { | |||
213 | schedule = __kmp_static; | |||
214 | } | |||
215 | // Use the chunk size specified by OMP_SCHEDULE (or default if not | |||
216 | // specified) | |||
217 | chunk = team->t.t_sched.chunk; | |||
218 | #if USE_ITT_BUILD1 | |||
219 | if (cur_chunk) | |||
220 | *cur_chunk = chunk; | |||
221 | #endif | |||
222 | #ifdef KMP_DEBUG1 | |||
223 | { | |||
224 | char *buff; | |||
225 | // create format specifiers before the debug output | |||
226 | buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: " | |||
227 | "schedule:%%d chunk:%%%s\n", | |||
228 | traits_t<ST>::spec); | |||
229 | KD_TRACE(10, (buff, gtid, schedule, chunk))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, schedule , chunk); }; | |||
230 | __kmp_str_free(&buff); | |||
231 | } | |||
232 | #endif | |||
233 | } else { | |||
234 | if (schedule == kmp_sch_guided_chunked) { | |||
235 | schedule = __kmp_guided; | |||
236 | } | |||
237 | if (chunk <= 0) { | |||
238 | chunk = KMP_DEFAULT_CHUNK1; | |||
239 | } | |||
240 | } | |||
241 | ||||
242 | if (schedule == kmp_sch_auto) { | |||
243 | // mapping and differentiation: in the __kmp_do_serial_initialize() | |||
244 | schedule = __kmp_auto; | |||
245 | #ifdef KMP_DEBUG1 | |||
246 | { | |||
247 | char *buff; | |||
248 | // create format specifiers before the debug output | |||
249 | buff = __kmp_str_format( | |||
250 | "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: " | |||
251 | "schedule:%%d chunk:%%%s\n", | |||
252 | traits_t<ST>::spec); | |||
253 | KD_TRACE(10, (buff, gtid, schedule, chunk))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, schedule , chunk); }; | |||
254 | __kmp_str_free(&buff); | |||
255 | } | |||
256 | #endif | |||
257 | } | |||
258 | #if KMP_STATIC_STEAL_ENABLED1 | |||
259 | // map nonmonotonic:dynamic to static steal | |||
260 | if (schedule == kmp_sch_dynamic_chunked) { | |||
261 | if (monotonicity == SCHEDULE_NONMONOTONIC0) | |||
262 | schedule = kmp_sch_static_steal; | |||
263 | } | |||
264 | #endif | |||
265 | /* guided analytical not safe for too many threads */ | |||
266 | if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) { | |||
267 | schedule = kmp_sch_guided_iterative_chunked; | |||
268 | KMP_WARNING(DispatchManyThreads)__kmp_msg(kmp_ms_warning, __kmp_msg_format(kmp_i18n_msg_DispatchManyThreads ), __kmp_msg_null); | |||
269 | } | |||
270 | if (schedule == kmp_sch_runtime_simd) { | |||
271 | // compiler provides simd_width in the chunk parameter | |||
272 | schedule = team->t.t_sched.r_sched_type; | |||
273 | monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); | |||
274 | schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule)(enum sched_type)( (schedule) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)); | |||
275 | // Detail the schedule if needed (global controls are differentiated | |||
276 | // appropriately) | |||
277 | if (schedule == kmp_sch_static || schedule == kmp_sch_auto || | |||
278 | schedule == __kmp_static) { | |||
279 | schedule = kmp_sch_static_balanced_chunked; | |||
280 | } else { | |||
281 | if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { | |||
282 | schedule = kmp_sch_guided_simd; | |||
283 | } | |||
284 | chunk = team->t.t_sched.chunk * chunk; | |||
285 | } | |||
286 | #if USE_ITT_BUILD1 | |||
287 | if (cur_chunk) | |||
288 | *cur_chunk = chunk; | |||
289 | #endif | |||
290 | #ifdef KMP_DEBUG1 | |||
291 | { | |||
292 | char *buff; | |||
293 | // create format specifiers before the debug output | |||
294 | buff = __kmp_str_format( | |||
295 | "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d" | |||
296 | " chunk:%%%s\n", | |||
297 | traits_t<ST>::spec); | |||
298 | KD_TRACE(10, (buff, gtid, schedule, chunk))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, schedule , chunk); }; | |||
299 | __kmp_str_free(&buff); | |||
300 | } | |||
301 | #endif | |||
302 | } | |||
303 | pr->u.p.parm1 = chunk; | |||
304 | } | |||
305 | KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),if (!((kmp_sch_lower < schedule && schedule < kmp_sch_upper ))) { __kmp_debug_assert(("unknown scheduling type"), "openmp/runtime/src/kmp_dispatch.cpp" , 306); } | |||
306 | "unknown scheduling type")if (!((kmp_sch_lower < schedule && schedule < kmp_sch_upper ))) { __kmp_debug_assert(("unknown scheduling type"), "openmp/runtime/src/kmp_dispatch.cpp" , 306); }; | |||
307 | ||||
308 | pr->u.p.count = 0; | |||
309 | ||||
310 | if (__kmp_env_consistency_check) { | |||
311 | if (st == 0) { | |||
312 | __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, | |||
313 | (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc); | |||
314 | } | |||
315 | } | |||
316 | // compute trip count | |||
317 | if (st == 1) { // most common case | |||
318 | if (ub >= lb) { | |||
319 | tc = ub - lb + 1; | |||
320 | } else { // ub < lb | |||
321 | tc = 0; // zero-trip | |||
322 | } | |||
323 | } else if (st < 0) { | |||
324 | if (lb >= ub) { | |||
325 | // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B), | |||
326 | // where the division needs to be unsigned regardless of the result type | |||
327 | tc = (UT)(lb - ub) / (-st) + 1; | |||
328 | } else { // lb < ub | |||
329 | tc = 0; // zero-trip | |||
330 | } | |||
331 | } else { // st > 0 | |||
332 | if (ub >= lb) { | |||
333 | // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B), | |||
334 | // where the division needs to be unsigned regardless of the result type | |||
335 | tc = (UT)(ub - lb) / st + 1; | |||
336 | } else { // ub < lb | |||
337 | tc = 0; // zero-trip | |||
338 | } | |||
339 | } | |||
340 | ||||
341 | #if KMP_STATS_ENABLED0 | |||
342 | if (KMP_MASTER_GTID(gtid)(0 == __kmp_tid_from_gtid((gtid)))) { | |||
343 | KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc)((void)0); | |||
344 | } | |||
345 | #endif | |||
346 | ||||
347 | pr->u.p.lb = lb; | |||
348 | pr->u.p.ub = ub; | |||
349 | pr->u.p.st = st; | |||
350 | pr->u.p.tc = tc; | |||
351 | ||||
352 | #if KMP_OS_WINDOWS0 | |||
353 | pr->u.p.last_upper = ub + st; | |||
354 | #endif /* KMP_OS_WINDOWS */ | |||
355 | ||||
356 | /* NOTE: only the active parallel region(s) has active ordered sections */ | |||
357 | ||||
358 | if (active) { | |||
359 | if (pr->flags.ordered) { | |||
360 | pr->ordered_bumped = 0; | |||
361 | pr->u.p.ordered_lower = 1; | |||
362 | pr->u.p.ordered_upper = 0; | |||
363 | } | |||
364 | } | |||
365 | ||||
366 | switch (schedule) { | |||
367 | #if KMP_STATIC_STEAL_ENABLED1 | |||
368 | case kmp_sch_static_steal: { | |||
369 | T ntc, init; | |||
370 | ||||
371 | KD_TRACE(100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n" , gtid); } | |||
372 | ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n" , gtid); } | |||
373 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n" , gtid); }; | |||
374 | ||||
375 | ntc = (tc % chunk ? 1 : 0) + tc / chunk; | |||
376 | if (nproc > 1 && ntc >= nproc) { | |||
377 | KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL)((void)0); | |||
378 | T id = tid; | |||
379 | T small_chunk, extras; | |||
380 | kmp_uint32 old = UNUSED; | |||
381 | int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED); | |||
382 | if (traits_t<T>::type_size > 4) { | |||
383 | // AC: TODO: check if 16-byte CAS available and use it to | |||
384 | // improve performance (probably wait for explicit request | |||
385 | // before spending time on this). | |||
386 | // For now use dynamically allocated per-private-buffer lock, | |||
387 | // free memory in __kmp_dispatch_next when status==0. | |||
388 | pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t))___kmp_allocate((sizeof(kmp_lock_t)), "openmp/runtime/src/kmp_dispatch.cpp" , 388); | |||
389 | __kmp_init_lock(pr->u.p.steal_lock); | |||
390 | } | |||
391 | small_chunk = ntc / nproc; | |||
392 | extras = ntc % nproc; | |||
393 | ||||
394 | init = id * small_chunk + (id < extras ? id : extras); | |||
395 | pr->u.p.count = init; | |||
396 | if (claimed) { // are we succeeded in claiming own buffer? | |||
397 | pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); | |||
398 | // Other threads will inspect steal_flag when searching for a victim. | |||
399 | // READY means other threads may steal from this thread from now on. | |||
400 | KMP_ATOMIC_ST_REL(&pr->steal_flag, READY)(&pr->steal_flag)->store(READY, std::memory_order_release ); | |||
401 | } else { | |||
402 | // other thread has stolen whole our range | |||
403 | KMP_DEBUG_ASSERT(pr->steal_flag == THIEF)if (!(pr->steal_flag == THIEF)) { __kmp_debug_assert("pr->steal_flag == THIEF" , "openmp/runtime/src/kmp_dispatch.cpp", 403); }; | |||
404 | pr->u.p.ub = init; // mark there is no iterations to work on | |||
405 | } | |||
406 | pr->u.p.parm2 = ntc; // save number of chunks | |||
407 | // parm3 is the number of times to attempt stealing which is | |||
408 | // nproc (just a heuristics, could be optimized later on). | |||
409 | pr->u.p.parm3 = nproc; | |||
410 | pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid | |||
411 | break; | |||
412 | } else { | |||
413 | /* too few chunks: switching to kmp_sch_dynamic_chunked */ | |||
414 | schedule = kmp_sch_dynamic_chunked; | |||
415 | KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d switching to " "kmp_sch_dynamic_chunked\n", gtid); } | |||
416 | "kmp_sch_dynamic_chunked\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d switching to " "kmp_sch_dynamic_chunked\n", gtid); } | |||
417 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d switching to " "kmp_sch_dynamic_chunked\n", gtid); }; | |||
418 | goto dynamic_init; | |||
419 | break; | |||
420 | } // if | |||
421 | } // case | |||
422 | #endif | |||
423 | case kmp_sch_static_balanced: { | |||
424 | T init, limit; | |||
425 | ||||
426 | KD_TRACE(if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); } | |||
427 | 100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); } | |||
428 | ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); } | |||
429 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); }; | |||
430 | ||||
431 | if (nproc > 1) { | |||
432 | T id = tid; | |||
433 | ||||
434 | if (tc < nproc) { | |||
435 | if (id < tc) { | |||
436 | init = id; | |||
437 | limit = id; | |||
438 | pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */ | |||
439 | } else { | |||
440 | pr->u.p.count = 1; /* means no more chunks to execute */ | |||
441 | pr->u.p.parm1 = FALSE0; | |||
442 | break; | |||
443 | } | |||
444 | } else { | |||
445 | T small_chunk = tc / nproc; | |||
446 | T extras = tc % nproc; | |||
447 | init = id * small_chunk + (id < extras ? id : extras); | |||
448 | limit = init + small_chunk - (id < extras ? 0 : 1); | |||
449 | pr->u.p.parm1 = (id == nproc - 1); | |||
450 | } | |||
451 | } else { | |||
452 | if (tc > 0) { | |||
453 | init = 0; | |||
454 | limit = tc - 1; | |||
455 | pr->u.p.parm1 = TRUE(!0); | |||
456 | } else { | |||
457 | // zero trip count | |||
458 | pr->u.p.count = 1; /* means no more chunks to execute */ | |||
459 | pr->u.p.parm1 = FALSE0; | |||
460 | break; | |||
461 | } | |||
462 | } | |||
463 | #if USE_ITT_BUILD1 | |||
464 | // Calculate chunk for metadata report | |||
465 | if (itt_need_metadata_reporting) | |||
466 | if (cur_chunk) | |||
467 | *cur_chunk = limit - init + 1; | |||
468 | #endif | |||
469 | if (st == 1) { | |||
470 | pr->u.p.lb = lb + init; | |||
471 | pr->u.p.ub = lb + limit; | |||
472 | } else { | |||
473 | // calculated upper bound, "ub" is user-defined upper bound | |||
474 | T ub_tmp = lb + limit * st; | |||
475 | pr->u.p.lb = lb + init * st; | |||
476 | // adjust upper bound to "ub" if needed, so that MS lastprivate will match | |||
477 | // it exactly | |||
478 | if (st > 0) { | |||
479 | pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp); | |||
480 | } else { | |||
481 | pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp); | |||
482 | } | |||
483 | } | |||
484 | if (pr->flags.ordered) { | |||
485 | pr->u.p.ordered_lower = init; | |||
486 | pr->u.p.ordered_upper = limit; | |||
487 | } | |||
488 | break; | |||
489 | } // case | |||
490 | case kmp_sch_static_balanced_chunked: { | |||
491 | // similar to balanced, but chunk adjusted to multiple of simd width | |||
492 | T nth = nproc; | |||
493 | KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" " -> falling-through to static_greedy\n", gtid); } | |||
494 | " -> falling-through to static_greedy\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" " -> falling-through to static_greedy\n", gtid); } | |||
495 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)" " -> falling-through to static_greedy\n", gtid); }; | |||
496 | schedule = kmp_sch_static_greedy; | |||
497 | if (nth > 1) | |||
498 | pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); | |||
499 | else | |||
500 | pr->u.p.parm1 = tc; | |||
501 | break; | |||
502 | } // case | |||
503 | case kmp_sch_guided_simd: | |||
504 | case kmp_sch_guided_iterative_chunked: { | |||
505 | KD_TRACE(if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" " case\n", gtid); } | |||
506 | 100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" " case\n", gtid); } | |||
507 | ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" " case\n", gtid); } | |||
508 | " case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" " case\n", gtid); } | |||
509 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked" " case\n", gtid); }; | |||
510 | ||||
511 | if (nproc > 1) { | |||
512 | if ((2L * chunk + 1) * nproc >= tc) { | |||
513 | /* chunk size too large, switch to dynamic */ | |||
514 | schedule = kmp_sch_dynamic_chunked; | |||
515 | goto dynamic_init; | |||
516 | } else { | |||
517 | // when remaining iters become less than parm2 - switch to dynamic | |||
518 | pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1); | |||
519 | *(double *)&pr->u.p.parm3 = | |||
520 | guided_flt_param / (double)nproc; // may occupy parm3 and parm4 | |||
521 | } | |||
522 | } else { | |||
523 | KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d falling-through to " "kmp_sch_static_greedy\n", gtid); } | |||
524 | "kmp_sch_static_greedy\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d falling-through to " "kmp_sch_static_greedy\n", gtid); } | |||
525 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d falling-through to " "kmp_sch_static_greedy\n", gtid); }; | |||
526 | schedule = kmp_sch_static_greedy; | |||
527 | /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ | |||
528 | KD_TRACE(if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); } | |||
529 | 100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); } | |||
530 | ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); } | |||
531 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); }; | |||
532 | pr->u.p.parm1 = tc; | |||
533 | } // if | |||
534 | } // case | |||
535 | break; | |||
536 | case kmp_sch_guided_analytical_chunked: { | |||
537 | KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d " "kmp_sch_guided_analytical_chunked case\n", gtid); } | |||
538 | "kmp_sch_guided_analytical_chunked case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d " "kmp_sch_guided_analytical_chunked case\n", gtid); } | |||
539 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d " "kmp_sch_guided_analytical_chunked case\n", gtid); }; | |||
540 | ||||
541 | if (nproc > 1) { | |||
542 | if ((2L * chunk + 1) * nproc >= tc) { | |||
543 | /* chunk size too large, switch to dynamic */ | |||
544 | schedule = kmp_sch_dynamic_chunked; | |||
545 | goto dynamic_init; | |||
546 | } else { | |||
547 | /* commonly used term: (2 nproc - 1)/(2 nproc) */ | |||
548 | DBL x; | |||
549 | ||||
550 | #if KMP_USE_X87CONTROL0 | |||
551 | /* Linux* OS already has 64-bit computation by default for long double, | |||
552 | and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On | |||
553 | Windows* OS on IA-32 architecture, we need to set precision to 64-bit | |||
554 | instead of the default 53-bit. Even though long double doesn't work | |||
555 | on Windows* OS on Intel(R) 64, the resulting lack of precision is not | |||
556 | expected to impact the correctness of the algorithm, but this has not | |||
557 | been mathematically proven. */ | |||
558 | // save original FPCW and set precision to 64-bit, as | |||
559 | // Windows* OS on IA-32 architecture defaults to 53-bit | |||
560 | unsigned int oldFpcw = _control87(0, 0); | |||
561 | _control87(_PC_64, _MCW_PC); // 0,0x30000 | |||
562 | #endif | |||
563 | /* value used for comparison in solver for cross-over point */ | |||
564 | KMP_ASSERT(tc > 0)if (!(tc > 0)) { __kmp_debug_assert("tc > 0", "openmp/runtime/src/kmp_dispatch.cpp" , 564); }; | |||
565 | long double target = ((long double)chunk * 2 + 1) * nproc / tc; | |||
566 | ||||
567 | /* crossover point--chunk indexes equal to or greater than | |||
568 | this point switch to dynamic-style scheduling */ | |||
569 | UT cross; | |||
570 | ||||
571 | /* commonly used term: (2 nproc - 1)/(2 nproc) */ | |||
572 | x = 1.0 - 0.5 / (double)nproc; | |||
573 | ||||
574 | #ifdef KMP_DEBUG1 | |||
575 | { // test natural alignment | |||
576 | struct _test_a { | |||
577 | char a; | |||
578 | union { | |||
579 | char b; | |||
580 | DBL d; | |||
581 | }; | |||
582 | } t; | |||
583 | ptrdiff_t natural_alignment = | |||
584 | (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1; | |||
585 | //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long | |||
586 | // long)natural_alignment ); | |||
587 | KMP_DEBUG_ASSERT(if (!((((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment )) == 0)) { __kmp_debug_assert("(((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0" , "openmp/runtime/src/kmp_dispatch.cpp", 588); } | |||
588 | (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0)if (!((((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment )) == 0)) { __kmp_debug_assert("(((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0" , "openmp/runtime/src/kmp_dispatch.cpp", 588); }; | |||
589 | } | |||
590 | #endif // KMP_DEBUG | |||
591 | ||||
592 | /* save the term in thread private dispatch structure */ | |||
593 | *(DBL *)&pr->u.p.parm3 = x; | |||
594 | ||||
595 | /* solve for the crossover point to the nearest integer i for which C_i | |||
596 | <= chunk */ | |||
597 | { | |||
598 | UT left, right, mid; | |||
599 | long double p; | |||
600 | ||||
601 | /* estimate initial upper and lower bound */ | |||
602 | ||||
603 | /* doesn't matter what value right is as long as it is positive, but | |||
604 | it affects performance of the solver */ | |||
605 | right = 229; | |||
606 | p = __kmp_pow<UT>(x, right); | |||
607 | if (p > target) { | |||
608 | do { | |||
609 | p *= p; | |||
610 | right <<= 1; | |||
611 | } while (p > target && right < (1 << 27)); | |||
612 | /* lower bound is previous (failed) estimate of upper bound */ | |||
613 | left = right >> 1; | |||
614 | } else { | |||
615 | left = 0; | |||
616 | } | |||
617 | ||||
618 | /* bisection root-finding method */ | |||
619 | while (left + 1 < right) { | |||
620 | mid = (left + right) / 2; | |||
621 | if (__kmp_pow<UT>(x, mid) > target) { | |||
622 | left = mid; | |||
623 | } else { | |||
624 | right = mid; | |||
625 | } | |||
626 | } // while | |||
627 | cross = right; | |||
628 | } | |||
629 | /* assert sanity of computed crossover point */ | |||
630 | KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&if (!(cross && __kmp_pow<UT>(x, cross - 1) > target && __kmp_pow<UT>(x, cross) <= target )) { __kmp_debug_assert("cross && __kmp_pow<UT>(x, cross - 1) > target && __kmp_pow<UT>(x, cross) <= target" , "openmp/runtime/src/kmp_dispatch.cpp", 631); } | |||
631 | __kmp_pow<UT>(x, cross) <= target)if (!(cross && __kmp_pow<UT>(x, cross - 1) > target && __kmp_pow<UT>(x, cross) <= target )) { __kmp_debug_assert("cross && __kmp_pow<UT>(x, cross - 1) > target && __kmp_pow<UT>(x, cross) <= target" , "openmp/runtime/src/kmp_dispatch.cpp", 631); }; | |||
632 | ||||
633 | /* save the crossover point in thread private dispatch structure */ | |||
634 | pr->u.p.parm2 = cross; | |||
635 | ||||
636 | // C75803 | |||
637 | #if ((KMP_OS_LINUX1 || KMP_OS_WINDOWS0) && KMP_ARCH_X860) && (!defined(KMP_I8)) | |||
638 | #define GUIDED_ANALYTICAL_WORKAROUND(x) (*(DBL *)&pr->u.p.parm3) | |||
639 | #else | |||
640 | #define GUIDED_ANALYTICAL_WORKAROUND(x) (x) | |||
641 | #endif | |||
642 | /* dynamic-style scheduling offset */ | |||
643 | pr->u.p.count = tc - | |||
644 | __kmp_dispatch_guided_remaining( | |||
645 | tc, GUIDED_ANALYTICAL_WORKAROUND(x), cross) - | |||
646 | cross * chunk; | |||
647 | #if KMP_USE_X87CONTROL0 | |||
648 | // restore FPCW | |||
649 | _control87(oldFpcw, _MCW_PC); | |||
650 | #endif | |||
651 | } // if | |||
652 | } else { | |||
653 | KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d falling-through to " "kmp_sch_static_greedy\n", gtid); } | |||
654 | "kmp_sch_static_greedy\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d falling-through to " "kmp_sch_static_greedy\n", gtid); } | |||
655 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d falling-through to " "kmp_sch_static_greedy\n", gtid); }; | |||
656 | schedule = kmp_sch_static_greedy; | |||
657 | /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */ | |||
658 | pr->u.p.parm1 = tc; | |||
659 | } // if | |||
660 | } // case | |||
661 | break; | |||
662 | case kmp_sch_static_greedy: | |||
663 | KD_TRACE(if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); } | |||
664 | 100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); } | |||
665 | ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); } | |||
666 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n" , gtid); }; | |||
667 | pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc; | |||
668 | break; | |||
669 | case kmp_sch_static_chunked: | |||
670 | case kmp_sch_dynamic_chunked: | |||
671 | dynamic_init: | |||
672 | if (tc == 0) | |||
673 | break; | |||
674 | if (pr->u.p.parm1 <= 0) | |||
675 | pr->u.p.parm1 = KMP_DEFAULT_CHUNK1; | |||
676 | else if (pr->u.p.parm1 > tc) | |||
677 | pr->u.p.parm1 = tc; | |||
678 | // Store the total number of chunks to prevent integer overflow during | |||
679 | // bounds calculations in the get next chunk routine. | |||
680 | pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0); | |||
681 | KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d " "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid ); } | |||
682 | "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d " "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid ); } | |||
683 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d " "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid ); }; | |||
684 | break; | |||
685 | case kmp_sch_trapezoidal: { | |||
686 | /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */ | |||
687 | ||||
688 | T parm1, parm2, parm3, parm4; | |||
689 | KD_TRACE(100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n" , gtid); } | |||
690 | ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n" , gtid); } | |||
691 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n" , gtid); }; | |||
692 | ||||
693 | parm1 = chunk; | |||
694 | ||||
695 | /* F : size of the first cycle */ | |||
696 | parm2 = (tc / (2 * nproc)); | |||
697 | ||||
698 | if (parm2 < 1) { | |||
699 | parm2 = 1; | |||
700 | } | |||
701 | ||||
702 | /* L : size of the last cycle. Make sure the last cycle is not larger | |||
703 | than the first cycle. */ | |||
704 | if (parm1 < 1) { | |||
705 | parm1 = 1; | |||
706 | } else if (parm1 > parm2) { | |||
707 | parm1 = parm2; | |||
708 | } | |||
709 | ||||
710 | /* N : number of cycles */ | |||
711 | parm3 = (parm2 + parm1); | |||
712 | parm3 = (2 * tc + parm3 - 1) / parm3; | |||
713 | ||||
714 | if (parm3 < 2) { | |||
715 | parm3 = 2; | |||
716 | } | |||
717 | ||||
718 | /* sigma : decreasing incr of the trapezoid */ | |||
719 | parm4 = (parm3 - 1); | |||
720 | parm4 = (parm2 - parm1) / parm4; | |||
721 | ||||
722 | // pointless check, because parm4 >= 0 always | |||
723 | // if ( parm4 < 0 ) { | |||
724 | // parm4 = 0; | |||
725 | //} | |||
726 | ||||
727 | pr->u.p.parm1 = parm1; | |||
728 | pr->u.p.parm2 = parm2; | |||
729 | pr->u.p.parm3 = parm3; | |||
730 | pr->u.p.parm4 = parm4; | |||
731 | } // case | |||
732 | break; | |||
733 | ||||
734 | default: { | |||
735 | __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected)__kmp_msg_format(kmp_i18n_msg_UnknownSchedTypeDetected), // Primary message | |||
736 | KMP_HNT(GetNewerLibrary)__kmp_msg_format(kmp_i18n_hnt_GetNewerLibrary), // Hint | |||
737 | __kmp_msg_null // Variadic argument list terminator | |||
738 | ); | |||
739 | } break; | |||
740 | } // switch | |||
741 | pr->schedule = schedule; | |||
742 | } | |||
743 | ||||
744 | #if KMP_USE_HIER_SCHED0 | |||
// Dispatch the hierarchical-scheduling initializer on the loop index type.
// The primary template is declared but intentionally never defined: only the
// four explicit specializations below are valid instantiations, so any other
// index type fails at link time.
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
// 32-bit signed index: forwards the globally parsed hierarchy description
// (__kmp_hier_scheds) using the 32-bit ("small") chunk array.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
// 32-bit unsigned index: also uses the 32-bit ("small") chunk array.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
// 64-bit signed index: uses the 64-bit ("large") chunk array.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
// 64-bit unsigned index: also uses the 64-bit ("large") chunk array.
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
780 | ||||
781 | // free all the hierarchy scheduling memory associated with the team | |||
782 | void __kmp_dispatch_free_hierarchies(kmp_team_t *team) { | |||
783 | int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; | |||
784 | for (int i = 0; i < num_disp_buff; ++i) { | |||
785 | // type does not matter here so use kmp_int32 | |||
786 | auto sh = | |||
787 | reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( | |||
788 | &team->t.t_disp_buffer[i]); | |||
789 | if (sh->hier) { | |||
790 | sh->hier->deallocate(); | |||
791 | __kmp_free(sh->hier)___kmp_free((sh->hier), "openmp/runtime/src/kmp_dispatch.cpp" , 791); | |||
792 | } | |||
793 | } | |||
794 | } | |||
795 | #endif | |||
796 | ||||
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
//
// Per-thread entry point for initializing a dynamically scheduled loop:
// picks the private/shared dispatch buffers for this loop instance, runs the
// schedule-specific setup in __kmp_dispatch_init_algorithm(), and installs
// the ordered-section enter/exit handlers on the thread's dispatch record.
// `push_ws` is accepted for interface compatibility; it is not referenced in
// this body.
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  // The templated structs must overlay the untyped runtime structs exactly,
  // since they are reinterpret_cast'ed from the same storage below.
  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing
  // when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  // Strip monotonic/nonmonotonic modifier bits, then normalize the
  // nomerge-schedule range back onto the base schedule range before testing
  // for orderedness.
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    // Serialized team: no buffer rotation needed, use the top-of-stack
    // private buffer directly.
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
    if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
                     " sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
      __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                             __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
      // Note: KMP_WAIT() cannot be used there: buffer index and
      // my_buffer_index are *always* 32-bit integers.
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                     "sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
    }
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    // Install ordered-section handlers: error stubs for unordered loops,
    // the real enter/exit (deo/dxo) functions for ordered ones.
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by primary thread of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USER_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
1012 | ||||
1013 | /* For ordered loops, either __kmp_dispatch_finish() should be called after | |||
1014 | * every iteration, or __kmp_dispatch_finish_chunk() should be called after | |||
1015 | * every chunk of iterations. If the ordered section(s) were not executed | |||
1016 | * for this iteration (or every iteration in this chunk), we need to set the | |||
1017 | * ordered iteration counters so that the next thread can proceed. */ | |||
// Per-iteration epilogue for ordered loops: if this thread's ordered section
// already ran for the current iteration (ordered_bumped set), just clear the
// flag; otherwise wait until the shared ordered_iteration counter reaches
// this thread's ordered_lower, then bump it so the next thread can proceed.
// No-op synchronization when the team is serialized.
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      // Ordered section already advanced the shared counter for this
      // iteration; just reset the per-thread flag.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Wait until it is this thread's turn in the ordered sequence.
      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      // Release the next thread in the ordered sequence.
      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
1079 | ||||
1080 | #ifdef KMP_GOMP_COMPAT | |||
1081 | ||||
// Per-chunk epilogue for ordered loops (GOMP compatibility path): accounts
// for the whole chunk [ordered_lower, ordered_upper] at once.  If the ordered
// sections already bumped the shared counter for every iteration of the
// chunk, just reset the flag; otherwise wait for this thread's turn and add
// the remaining (un-bumped) iteration count in one atomic step so following
// threads can proceed.
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // inc = number of iterations in the current chunk.
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      // Every iteration's ordered section already ran: counter is fully
      // advanced, just clear the per-thread bump count.
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      // Only part of the chunk was bumped; advance by the remainder.
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Wait until it is this thread's turn in the ordered sequence.
      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      //!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Release the rest of the chunk's iterations in one atomic add.
      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}
1157 | ||||
1158 | #endif /* KMP_GOMP_COMPAT */ | |||
1159 | ||||
1160 | template <typename T> | |||
1161 | int __kmp_dispatch_next_algorithm(int gtid, | |||
1162 | dispatch_private_info_template<T> *pr, | |||
1163 | dispatch_shared_info_template<T> volatile *sh, | |||
1164 | kmp_int32 *p_last, T *p_lb, T *p_ub, | |||
1165 | typename traits_t<T>::signed_t *p_st, T nproc, | |||
1166 | T tid) { | |||
1167 | typedef typename traits_t<T>::unsigned_t UT; | |||
1168 | typedef typename traits_t<T>::signed_t ST; | |||
1169 | typedef typename traits_t<T>::floating_t DBL; | |||
1170 | int status = 0; | |||
1171 | bool last = false; | |||
1172 | T start; | |||
1173 | ST incr; | |||
1174 | UT limit, trip, init; | |||
1175 | kmp_info_t *th = __kmp_threads[gtid]; | |||
1176 | kmp_team_t *team = th->th.th_team; | |||
1177 | ||||
1178 | KMP_DEBUG_ASSERT(th->th.th_dispatch ==if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 1179); } | |||
| ||||
1179 | &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid])if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 1179); }; | |||
1180 | KMP_DEBUG_ASSERT(pr)if (!(pr)) { __kmp_debug_assert("pr", "openmp/runtime/src/kmp_dispatch.cpp" , 1180); }; | |||
1181 | KMP_DEBUG_ASSERT(sh)if (!(sh)) { __kmp_debug_assert("sh", "openmp/runtime/src/kmp_dispatch.cpp" , 1181); }; | |||
1182 | KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc)if (!(tid >= 0 && tid < nproc)) { __kmp_debug_assert ("tid >= 0 && tid < nproc", "openmp/runtime/src/kmp_dispatch.cpp" , 1182); }; | |||
1183 | #ifdef KMP_DEBUG1 | |||
1184 | { | |||
1185 | char *buff; | |||
1186 | // create format specifiers before the debug output | |||
1187 | buff = | |||
1188 | __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p " | |||
1189 | "sh:%%p nproc:%%%s tid:%%%s\n", | |||
1190 | traits_t<T>::spec, traits_t<T>::spec); | |||
1191 | KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, pr , sh, nproc, tid); }; | |||
1192 | __kmp_str_free(&buff); | |||
1193 | } | |||
1194 | #endif | |||
1195 | ||||
1196 | // zero trip count | |||
1197 | if (pr->u.p.tc == 0) { | |||
1198 | KD_TRACE(10,if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " "zero status:%d\n", gtid, status); } | |||
1199 | ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " "zero status:%d\n", gtid, status); } | |||
1200 | "zero status:%d\n",if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " "zero status:%d\n", gtid, status); } | |||
1201 | gtid, status))if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is " "zero status:%d\n", gtid, status); }; | |||
1202 | return 0; | |||
1203 | } | |||
1204 | ||||
1205 | switch (pr->schedule) { | |||
1206 | #if KMP_STATIC_STEAL_ENABLED1 | |||
1207 | case kmp_sch_static_steal: { | |||
1208 | T chunk = pr->u.p.parm1; | |||
1209 | UT nchunks = pr->u.p.parm2; | |||
1210 | KD_TRACE(100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n" , gtid); } | |||
1211 | ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n" , gtid); } | |||
1212 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n" , gtid); }; | |||
1213 | ||||
1214 | trip = pr->u.p.tc - 1; | |||
1215 | ||||
1216 | if (traits_t<T>::type_size > 4) { | |||
1217 | // use lock for 8-byte induction variable. | |||
1218 | // TODO (optional): check presence and use 16-byte CAS | |||
1219 | kmp_lock_t *lck = pr->u.p.steal_lock; | |||
1220 | KMP_DEBUG_ASSERT(lck != NULL)if (!(lck != __null)) { __kmp_debug_assert("lck != __null", "openmp/runtime/src/kmp_dispatch.cpp" , 1220); }; | |||
1221 | if (pr->u.p.count < (UT)pr->u.p.ub) { | |||
1222 | KMP_DEBUG_ASSERT(pr->steal_flag == READY)if (!(pr->steal_flag == READY)) { __kmp_debug_assert("pr->steal_flag == READY" , "openmp/runtime/src/kmp_dispatch.cpp", 1222); }; | |||
1223 | __kmp_acquire_lock(lck, gtid); | |||
1224 | // try to get own chunk of iterations | |||
1225 | init = (pr->u.p.count)++; | |||
1226 | status = (init < (UT)pr->u.p.ub); | |||
1227 | __kmp_release_lock(lck, gtid); | |||
1228 | } else { | |||
1229 | status = 0; // no own chunks | |||
1230 | } | |||
1231 | if (!status) { // try to steal | |||
1232 | kmp_lock_t *lckv; // victim buffer's lock | |||
1233 | T while_limit = pr->u.p.parm3; | |||
1234 | T while_index = 0; | |||
1235 | int idx = (th->th.th_dispatch->th_disp_index - 1) % | |||
1236 | __kmp_dispatch_num_buffers; // current loop index | |||
1237 | // note: victim thread can potentially execute another loop | |||
1238 | KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF)(&pr->steal_flag)->store(THIEF, std::memory_order_release ); // mark self buffer inactive | |||
1239 | while ((!status) && (while_limit != ++while_index)) { | |||
1240 | dispatch_private_info_template<T> *v; | |||
1241 | T remaining; | |||
1242 | T victimId = pr->u.p.parm4; | |||
1243 | T oldVictimId = victimId ? victimId - 1 : nproc - 1; | |||
1244 | v = reinterpret_cast<dispatch_private_info_template<T> *>( | |||
1245 | &team->t.t_dispatch[victimId].th_disp_buffer[idx]); | |||
1246 | KMP_DEBUG_ASSERT(v)if (!(v)) { __kmp_debug_assert("v", "openmp/runtime/src/kmp_dispatch.cpp" , 1246); }; | |||
1247 | while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_relaxed) == THIEF) && | |||
1248 | oldVictimId != victimId) { | |||
1249 | victimId = (victimId + 1) % nproc; | |||
1250 | v = reinterpret_cast<dispatch_private_info_template<T> *>( | |||
1251 | &team->t.t_dispatch[victimId].th_disp_buffer[idx]); | |||
1252 | KMP_DEBUG_ASSERT(v)if (!(v)) { __kmp_debug_assert("v", "openmp/runtime/src/kmp_dispatch.cpp" , 1252); }; | |||
1253 | } | |||
1254 | if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_relaxed) == THIEF) { | |||
1255 | continue; // try once more (nproc attempts in total) | |||
1256 | } | |||
1257 | if (KMP_ATOMIC_LD_RLX(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_relaxed) == UNUSED) { | |||
1258 | kmp_uint32 old = UNUSED; | |||
1259 | // try to steal whole range from inactive victim | |||
1260 | status = v->steal_flag.compare_exchange_strong(old, THIEF); | |||
1261 | if (status) { | |||
1262 | // initialize self buffer with victim's whole range of chunks | |||
1263 | T id = victimId; | |||
1264 | T small_chunk, extras; | |||
1265 | small_chunk = nchunks / nproc; // chunks per thread | |||
1266 | extras = nchunks % nproc; | |||
1267 | init = id * small_chunk + (id < extras ? id : extras); | |||
1268 | __kmp_acquire_lock(lck, gtid); | |||
1269 | pr->u.p.count = init + 1; // exclude one we execute immediately | |||
1270 | pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); | |||
1271 | __kmp_release_lock(lck, gtid); | |||
1272 | pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid | |||
1273 | // no need to reinitialize other thread invariants: lb, st, etc. | |||
1274 | #ifdef KMP_DEBUG1 | |||
1275 | { | |||
1276 | char *buff; | |||
1277 | // create format specifiers before the debug output | |||
1278 | buff = __kmp_str_format( | |||
1279 | "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " | |||
1280 | "count:%%%s ub:%%%s\n", | |||
1281 | traits_t<UT>::spec, traits_t<T>::spec); | |||
1282 | KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, id , pr->u.p.count, pr->u.p.ub); }; | |||
1283 | __kmp_str_free(&buff); | |||
1284 | } | |||
1285 | #endif | |||
1286 | // activate non-empty buffer and let others steal from us | |||
1287 | if (pr->u.p.count < (UT)pr->u.p.ub) | |||
1288 | KMP_ATOMIC_ST_REL(&pr->steal_flag, READY)(&pr->steal_flag)->store(READY, std::memory_order_release ); | |||
1289 | break; | |||
1290 | } | |||
1291 | } | |||
1292 | if (KMP_ATOMIC_LD_ACQ(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_acquire) != READY || | |||
1293 | v->u.p.count >= (UT)v->u.p.ub) { | |||
1294 | pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid | |||
1295 | continue; // no chunks to steal, try next victim | |||
1296 | } | |||
1297 | lckv = v->u.p.steal_lock; | |||
1298 | KMP_ASSERT(lckv != NULL)if (!(lckv != __null)) { __kmp_debug_assert("lckv != NULL", "openmp/runtime/src/kmp_dispatch.cpp" , 1298); }; | |||
1299 | __kmp_acquire_lock(lckv, gtid); | |||
1300 | limit = v->u.p.ub; // keep initial ub | |||
1301 | if (v->u.p.count >= limit) { | |||
1302 | __kmp_release_lock(lckv, gtid); | |||
1303 | pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid | |||
1304 | continue; // no chunks to steal, try next victim | |||
1305 | } | |||
1306 | ||||
1307 | // stealing succeeded, reduce victim's ub by 1/4 of undone chunks | |||
1308 | // TODO: is this heuristic good enough?? | |||
1309 | remaining = limit - v->u.p.count; | |||
1310 | if (remaining > 7) { | |||
1311 | // steal 1/4 of remaining | |||
1312 | KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2)((void)0); | |||
1313 | init = (v->u.p.ub -= (remaining >> 2)); | |||
1314 | } else { | |||
1315 | // steal 1 chunk of 1..7 remaining | |||
1316 | KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1)((void)0); | |||
1317 | init = (v->u.p.ub -= 1); | |||
1318 | } | |||
1319 | __kmp_release_lock(lckv, gtid); | |||
1320 | #ifdef KMP_DEBUG1 | |||
1321 | { | |||
1322 | char *buff; | |||
1323 | // create format specifiers before the debug output | |||
1324 | buff = __kmp_str_format( | |||
1325 | "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " | |||
1326 | "count:%%%s ub:%%%s\n", | |||
1327 | traits_t<UT>::spec, traits_t<UT>::spec); | |||
1328 | KD_TRACE(10, (buff, gtid, victimId, init, limit))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, victimId , init, limit); }; | |||
1329 | __kmp_str_free(&buff); | |||
1330 | } | |||
1331 | #endif | |||
1332 | KMP_DEBUG_ASSERT(init + 1 <= limit)if (!(init + 1 <= limit)) { __kmp_debug_assert("init + 1 <= limit" , "openmp/runtime/src/kmp_dispatch.cpp", 1332); }; | |||
1333 | pr->u.p.parm4 = victimId; // remember victim to steal from | |||
1334 | status = 1; | |||
1335 | // now update own count and ub with stolen range excluding init chunk | |||
1336 | __kmp_acquire_lock(lck, gtid); | |||
1337 | pr->u.p.count = init + 1; | |||
1338 | pr->u.p.ub = limit; | |||
1339 | __kmp_release_lock(lck, gtid); | |||
1340 | // activate non-empty buffer and let others steal from us | |||
1341 | if (init + 1 < limit) | |||
1342 | KMP_ATOMIC_ST_REL(&pr->steal_flag, READY)(&pr->steal_flag)->store(READY, std::memory_order_release ); | |||
1343 | } // while (search for victim) | |||
1344 | } // if (try to find victim and steal) | |||
1345 | } else { | |||
1346 | // 4-byte induction variable, use 8-byte CAS for pair (count, ub) | |||
1347 | // as all operations on pair (count, ub) must be done atomically | |||
1348 | typedef union { | |||
1349 | struct { | |||
1350 | UT count; | |||
1351 | T ub; | |||
1352 | } p; | |||
1353 | kmp_int64 b; | |||
1354 | } union_i4; | |||
1355 | union_i4 vold, vnew; | |||
1356 | if (pr->u.p.count < (UT)pr->u.p.ub) { | |||
1357 | KMP_DEBUG_ASSERT(pr->steal_flag == READY)if (!(pr->steal_flag == READY)) { __kmp_debug_assert("pr->steal_flag == READY" , "openmp/runtime/src/kmp_dispatch.cpp", 1357); }; | |||
1358 | vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); | |||
1359 | vnew.b = vold.b; | |||
1360 | vnew.p.count++; // get chunk from head of self range | |||
1361 | while (!KMP_COMPARE_AND_STORE_REL64(__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&pr->u.p.count), (kmp_uint64)(*(kmp_int64 *) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b)) | |||
1362 | (volatile kmp_int64 *)&pr->u.p.count,__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&pr->u.p.count), (kmp_uint64)(*(kmp_int64 *) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b)) | |||
1363 | *VOLATILE_CAST(kmp_int64 *) & vold.b,__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&pr->u.p.count), (kmp_uint64)(*(kmp_int64 *) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b)) | |||
1364 | *VOLATILE_CAST(kmp_int64 *) & vnew.b)__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&pr->u.p.count), (kmp_uint64)(*(kmp_int64 *) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b))) { | |||
1365 | KMP_CPU_PAUSE()__kmp_x86_pause(); | |||
1366 | vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); | |||
1367 | vnew.b = vold.b; | |||
1368 | vnew.p.count++; | |||
1369 | } | |||
1370 | init = vold.p.count; | |||
1371 | status = (init < (UT)vold.p.ub); | |||
1372 | } else { | |||
1373 | status = 0; // no own chunks | |||
1374 | } | |||
1375 | if (!status) { // try to steal | |||
1376 | T while_limit = pr->u.p.parm3; | |||
1377 | T while_index = 0; | |||
1378 | int idx = (th->th.th_dispatch->th_disp_index - 1) % | |||
1379 | __kmp_dispatch_num_buffers; // current loop index | |||
1380 | // note: victim thread can potentially execute another loop | |||
1381 | KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF)(&pr->steal_flag)->store(THIEF, std::memory_order_release ); // mark self buffer inactive | |||
1382 | while ((!status) && (while_limit != ++while_index)) { | |||
1383 | dispatch_private_info_template<T> *v; | |||
1384 | T remaining; | |||
1385 | T victimId = pr->u.p.parm4; | |||
1386 | T oldVictimId = victimId ? victimId - 1 : nproc - 1; | |||
1387 | v = reinterpret_cast<dispatch_private_info_template<T> *>( | |||
1388 | &team->t.t_dispatch[victimId].th_disp_buffer[idx]); | |||
1389 | KMP_DEBUG_ASSERT(v)if (!(v)) { __kmp_debug_assert("v", "openmp/runtime/src/kmp_dispatch.cpp" , 1389); }; | |||
1390 | while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_relaxed) == THIEF) && | |||
1391 | oldVictimId != victimId) { | |||
1392 | victimId = (victimId + 1) % nproc; | |||
1393 | v = reinterpret_cast<dispatch_private_info_template<T> *>( | |||
1394 | &team->t.t_dispatch[victimId].th_disp_buffer[idx]); | |||
1395 | KMP_DEBUG_ASSERT(v)if (!(v)) { __kmp_debug_assert("v", "openmp/runtime/src/kmp_dispatch.cpp" , 1395); }; | |||
1396 | } | |||
1397 | if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_relaxed) == THIEF) { | |||
1398 | continue; // try once more (nproc attempts in total) | |||
1399 | } | |||
1400 | if (KMP_ATOMIC_LD_RLX(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_relaxed) == UNUSED) { | |||
1401 | kmp_uint32 old = UNUSED; | |||
1402 | // try to steal whole range from inactive victim | |||
1403 | status = v->steal_flag.compare_exchange_strong(old, THIEF); | |||
1404 | if (status) { | |||
1405 | // initialize self buffer with victim's whole range of chunks | |||
1406 | T id = victimId; | |||
1407 | T small_chunk, extras; | |||
1408 | small_chunk = nchunks / nproc; // chunks per thread | |||
1409 | extras = nchunks % nproc; | |||
1410 | init = id * small_chunk + (id < extras ? id : extras); | |||
1411 | vnew.p.count = init + 1; | |||
1412 | vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0); | |||
1413 | // write pair (count, ub) at once atomically | |||
1414 | #if KMP_ARCH_X860 | |||
1415 | KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b)__sync_lock_test_and_set((volatile kmp_uint64 *)((volatile kmp_int64 *)(&pr->u.p.count)), (kmp_uint64)(vnew.b)); | |||
1416 | #else | |||
1417 | *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b; | |||
1418 | #endif | |||
1419 | pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid | |||
1420 | // no need to initialize other thread invariants: lb, st, etc. | |||
1421 | #ifdef KMP_DEBUG1 | |||
1422 | { | |||
1423 | char *buff; | |||
1424 | // create format specifiers before the debug output | |||
1425 | buff = __kmp_str_format( | |||
1426 | "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " | |||
1427 | "count:%%%s ub:%%%s\n", | |||
1428 | traits_t<UT>::spec, traits_t<T>::spec); | |||
1429 | KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, id , pr->u.p.count, pr->u.p.ub); }; | |||
1430 | __kmp_str_free(&buff); | |||
1431 | } | |||
1432 | #endif | |||
1433 | // activate non-empty buffer and let others steal from us | |||
1434 | if (pr->u.p.count < (UT)pr->u.p.ub) | |||
1435 | KMP_ATOMIC_ST_REL(&pr->steal_flag, READY)(&pr->steal_flag)->store(READY, std::memory_order_release ); | |||
1436 | break; | |||
1437 | } | |||
1438 | } | |||
1439 | while (1) { // CAS loop with check if victim still has enough chunks | |||
1440 | // many threads may be stealing concurrently from same victim | |||
1441 | vold.b = *(volatile kmp_int64 *)(&v->u.p.count); | |||
1442 | if (KMP_ATOMIC_LD_ACQ(&v->steal_flag)(&v->steal_flag)->load(std::memory_order_acquire) != READY || | |||
1443 | vold.p.count >= (UT)vold.p.ub) { | |||
1444 | pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id | |||
1445 | break; // no chunks to steal, try next victim | |||
1446 | } | |||
1447 | vnew.b = vold.b; | |||
1448 | remaining = vold.p.ub - vold.p.count; | |||
1449 | // try to steal 1/4 of remaining | |||
1450 | // TODO: is this heuristic good enough?? | |||
1451 | if (remaining > 7) { | |||
1452 | vnew.p.ub -= remaining >> 2; // steal from tail of victim's range | |||
1453 | } else { | |||
1454 | vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining | |||
1455 | } | |||
1456 | KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip)if (!(vnew.p.ub * (UT)chunk <= trip)) { __kmp_debug_assert ("vnew.p.ub * (UT)chunk <= trip", "openmp/runtime/src/kmp_dispatch.cpp" , 1456); }; | |||
1457 | if (KMP_COMPARE_AND_STORE_REL64(__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&v->u.p.count), (kmp_uint64)(*(kmp_int64 * ) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b)) | |||
1458 | (volatile kmp_int64 *)&v->u.p.count,__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&v->u.p.count), (kmp_uint64)(*(kmp_int64 * ) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b)) | |||
1459 | *VOLATILE_CAST(kmp_int64 *) & vold.b,__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&v->u.p.count), (kmp_uint64)(*(kmp_int64 * ) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b)) | |||
1460 | *VOLATILE_CAST(kmp_int64 *) & vnew.b)__sync_bool_compare_and_swap((volatile kmp_uint64 *)((volatile kmp_int64 *)&v->u.p.count), (kmp_uint64)(*(kmp_int64 * ) & vold.b), (kmp_uint64)(*(kmp_int64 *) & vnew.b))) { | |||
1461 | // stealing succeeded | |||
1462 | #ifdef KMP_DEBUG1 | |||
1463 | { | |||
1464 | char *buff; | |||
1465 | // create format specifiers before the debug output | |||
1466 | buff = __kmp_str_format( | |||
1467 | "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " | |||
1468 | "count:%%%s ub:%%%s\n", | |||
1469 | traits_t<T>::spec, traits_t<T>::spec); | |||
1470 | KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, victimId , vnew.p.ub, vold.p.ub); }; | |||
1471 | __kmp_str_free(&buff); | |||
1472 | } | |||
1473 | #endif | |||
1474 | KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,((void)0) | |||
1475 | vold.p.ub - vnew.p.ub)((void)0); | |||
1476 | status = 1; | |||
1477 | pr->u.p.parm4 = victimId; // keep victim id | |||
1478 | // now update own count and ub | |||
1479 | init = vnew.p.ub; | |||
1480 | vold.p.count = init + 1; | |||
1481 | #if KMP_ARCH_X860 | |||
1482 | KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b)__sync_lock_test_and_set((volatile kmp_uint64 *)((volatile kmp_int64 *)(&pr->u.p.count)), (kmp_uint64)(vold.b)); | |||
1483 | #else | |||
1484 | *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; | |||
1485 | #endif | |||
1486 | // activate non-empty buffer and let others steal from us | |||
1487 | if (vold.p.count < (UT)vold.p.ub) | |||
1488 | KMP_ATOMIC_ST_REL(&pr->steal_flag, READY)(&pr->steal_flag)->store(READY, std::memory_order_release ); | |||
1489 | break; | |||
1490 | } // if (check CAS result) | |||
1491 | KMP_CPU_PAUSE()__kmp_x86_pause(); // CAS failed, repeatedly attempt | |||
1492 | } // while (try to steal from particular victim) | |||
1493 | } // while (search for victim) | |||
1494 | } // if (try to find victim and steal) | |||
1495 | } // if (4-byte induction variable) | |||
1496 | if (!status) { | |||
1497 | *p_lb = 0; | |||
1498 | *p_ub = 0; | |||
1499 | if (p_st != NULL__null) | |||
1500 | *p_st = 0; | |||
1501 | } else { | |||
1502 | start = pr->u.p.lb; | |||
1503 | init *= chunk; | |||
1504 | limit = chunk + init - 1; | |||
1505 | incr = pr->u.p.st; | |||
1506 | KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1)((void)0); | |||
1507 | ||||
1508 | KMP_DEBUG_ASSERT(init <= trip)if (!(init <= trip)) { __kmp_debug_assert("init <= trip" , "openmp/runtime/src/kmp_dispatch.cpp", 1508); }; | |||
1509 | // keep track of done chunks for possible early exit from stealing | |||
1510 | // TODO: count executed chunks locally with rare update of shared location | |||
1511 | // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); | |||
1512 | if ((last = (limit >= trip)) != 0) | |||
1513 | limit = trip; | |||
1514 | if (p_st != NULL__null) | |||
1515 | *p_st = incr; | |||
1516 | ||||
1517 | if (incr == 1) { | |||
1518 | *p_lb = start + init; | |||
1519 | *p_ub = start + limit; | |||
1520 | } else { | |||
1521 | *p_lb = start + init * incr; | |||
1522 | *p_ub = start + limit * incr; | |||
1523 | } | |||
1524 | } // if | |||
1525 | break; | |||
1526 | } // case | |||
1527 | #endif // KMP_STATIC_STEAL_ENABLED | |||
1528 | case kmp_sch_static_balanced: { | |||
1529 | KD_TRACE(if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); } | |||
1530 | 10,if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); } | |||
1531 | ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); } | |||
1532 | gtid))if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n" , gtid); }; | |||
1533 | /* check if thread has any iteration to do */ | |||
1534 | if ((status = !pr->u.p.count) != 0) { | |||
1535 | pr->u.p.count = 1; | |||
1536 | *p_lb = pr->u.p.lb; | |||
1537 | *p_ub = pr->u.p.ub; | |||
1538 | last = (pr->u.p.parm1 != 0); | |||
1539 | if (p_st != NULL__null) | |||
1540 | *p_st = pr->u.p.st; | |||
1541 | } else { /* no iterations to do */ | |||
1542 | pr->u.p.lb = pr->u.p.ub + pr->u.p.st; | |||
1543 | } | |||
1544 | } // case | |||
1545 | break; | |||
1546 | case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was | |||
1547 | merged here */ | |||
1548 | case kmp_sch_static_chunked: { | |||
1549 | T parm1; | |||
1550 | ||||
1551 | KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d " "kmp_sch_static_[affinity|chunked] case\n", gtid); } | |||
1552 | "kmp_sch_static_[affinity|chunked] case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d " "kmp_sch_static_[affinity|chunked] case\n", gtid); } | |||
1553 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d " "kmp_sch_static_[affinity|chunked] case\n", gtid); }; | |||
1554 | parm1 = pr->u.p.parm1; | |||
1555 | ||||
1556 | trip = pr->u.p.tc - 1; | |||
1557 | init = parm1 * (pr->u.p.count + tid); | |||
1558 | ||||
1559 | if ((status = (init <= trip)) != 0) { | |||
1560 | start = pr->u.p.lb; | |||
1561 | incr = pr->u.p.st; | |||
1562 | limit = parm1 + init - 1; | |||
1563 | ||||
1564 | if ((last = (limit >= trip)) != 0) | |||
1565 | limit = trip; | |||
1566 | ||||
1567 | if (p_st != NULL__null) | |||
1568 | *p_st = incr; | |||
1569 | ||||
1570 | pr->u.p.count += nproc; | |||
1571 | ||||
1572 | if (incr == 1) { | |||
1573 | *p_lb = start + init; | |||
1574 | *p_ub = start + limit; | |||
1575 | } else { | |||
1576 | *p_lb = start + init * incr; | |||
1577 | *p_ub = start + limit * incr; | |||
1578 | } | |||
1579 | ||||
1580 | if (pr->flags.ordered) { | |||
1581 | pr->u.p.ordered_lower = init; | |||
1582 | pr->u.p.ordered_upper = limit; | |||
1583 | } // if | |||
1584 | } // if | |||
1585 | } // case | |||
1586 | break; | |||
1587 | ||||
1588 | case kmp_sch_dynamic_chunked: { | |||
1589 | UT chunk_number; | |||
1590 | UT chunk_size = pr->u.p.parm1; | |||
1591 | UT nchunks = pr->u.p.parm2; | |||
1592 | ||||
1593 | KD_TRACE(if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n" , gtid); } | |||
1594 | 100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n" , gtid); } | |||
1595 | ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n" , gtid); } | |||
1596 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n" , gtid); }; | |||
1597 | ||||
1598 | chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); | |||
1599 | status = (chunk_number < nchunks); | |||
1600 | if (!status) { | |||
1601 | *p_lb = 0; | |||
1602 | *p_ub = 0; | |||
1603 | if (p_st != NULL__null) | |||
1604 | *p_st = 0; | |||
1605 | } else { | |||
1606 | init = chunk_size * chunk_number; | |||
1607 | trip = pr->u.p.tc - 1; | |||
1608 | start = pr->u.p.lb; | |||
1609 | incr = pr->u.p.st; | |||
1610 | ||||
1611 | if ((last = (trip - init < (UT)chunk_size))) | |||
1612 | limit = trip; | |||
1613 | else | |||
1614 | limit = chunk_size + init - 1; | |||
1615 | ||||
1616 | if (p_st != NULL__null) | |||
1617 | *p_st = incr; | |||
1618 | ||||
1619 | if (incr == 1) { | |||
1620 | *p_lb = start + init; | |||
1621 | *p_ub = start + limit; | |||
1622 | } else { | |||
1623 | *p_lb = start + init * incr; | |||
1624 | *p_ub = start + limit * incr; | |||
1625 | } | |||
1626 | ||||
1627 | if (pr->flags.ordered) { | |||
1628 | pr->u.p.ordered_lower = init; | |||
1629 | pr->u.p.ordered_upper = limit; | |||
1630 | } // if | |||
1631 | } // if | |||
1632 | } // case | |||
1633 | break; | |||
1634 | ||||
1635 | case kmp_sch_guided_iterative_chunked: { | |||
1636 | T chunkspec = pr->u.p.parm1; | |||
1637 | KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " "iterative case\n", gtid); } | |||
1638 | "iterative case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " "iterative case\n", gtid); } | |||
1639 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked " "iterative case\n", gtid); }; | |||
1640 | trip = pr->u.p.tc; | |||
1641 | // Start atomic part of calculations | |||
1642 | while (1) { | |||
1643 | ST remaining; // signed, because can be < 0 | |||
1644 | init = sh->u.s.iteration; // shared value | |||
1645 | remaining = trip - init; | |||
1646 | if (remaining <= 0) { // AC: need to compare with 0 first | |||
1647 | // nothing to do, don't try atomic op | |||
1648 | status = 0; | |||
1649 | break; | |||
1650 | } | |||
1651 | if ((T)remaining < | |||
1652 | pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default | |||
1653 | // use dynamic-style schedule | |||
1654 | // atomically increment iterations, get old value | |||
1655 | init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration)reinterpret_cast<volatile ST *>(&sh->u.s.iteration ), | |||
1656 | (ST)chunkspec); | |||
1657 | remaining = trip - init; | |||
1658 | if (remaining <= 0) { | |||
1659 | status = 0; // all iterations got by other threads | |||
1660 | } else { | |||
1661 | // got some iterations to work on | |||
1662 | status = 1; | |||
1663 | if ((T)remaining > chunkspec) { | |||
1664 | limit = init + chunkspec - 1; | |||
1665 | } else { | |||
1666 | last = true; // the last chunk | |||
1667 | limit = init + remaining - 1; | |||
1668 | } // if | |||
1669 | } // if | |||
1670 | break; | |||
1671 | } // if | |||
1672 | limit = init + (UT)((double)remaining * | |||
1673 | *(double *)&pr->u.p.parm3); // divide by K*nproc | |||
1674 | if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration)reinterpret_cast<volatile ST *>(&sh->u.s.iteration ), | |||
1675 | (ST)init, (ST)limit)) { | |||
1676 | // CAS was successful, chunk obtained | |||
1677 | status = 1; | |||
1678 | --limit; | |||
1679 | break; | |||
1680 | } // if | |||
1681 | } // while | |||
1682 | if (status != 0) { | |||
1683 | start = pr->u.p.lb; | |||
1684 | incr = pr->u.p.st; | |||
1685 | if (p_st != NULL__null) | |||
1686 | *p_st = incr; | |||
1687 | *p_lb = start + init * incr; | |||
1688 | *p_ub = start + limit * incr; | |||
1689 | if (pr->flags.ordered) { | |||
1690 | pr->u.p.ordered_lower = init; | |||
1691 | pr->u.p.ordered_upper = limit; | |||
1692 | } // if | |||
1693 | } else { | |||
1694 | *p_lb = 0; | |||
1695 | *p_ub = 0; | |||
1696 | if (p_st != NULL__null) | |||
1697 | *p_st = 0; | |||
1698 | } // if | |||
1699 | } // case | |||
1700 | break; | |||
1701 | ||||
1702 | case kmp_sch_guided_simd: { | |||
1703 | // same as iterative but curr-chunk adjusted to be multiple of given | |||
1704 | // chunk | |||
1705 | T chunk = pr->u.p.parm1; | |||
1706 | KD_TRACE(100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n" , gtid); } | |||
1707 | ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n" , gtid); } | |||
1708 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n" , gtid); }; | |||
1709 | trip = pr->u.p.tc; | |||
1710 | // Start atomic part of calculations | |||
1711 | while (1) { | |||
1712 | ST remaining; // signed, because can be < 0 | |||
1713 | init = sh->u.s.iteration; // shared value | |||
1714 | remaining = trip - init; | |||
1715 | if (remaining <= 0) { // AC: need to compare with 0 first | |||
1716 | status = 0; // nothing to do, don't try atomic op | |||
1717 | break; | |||
1718 | } | |||
1719 | KMP_DEBUG_ASSERT(chunk && init % chunk == 0)if (!(chunk && init % chunk == 0)) { __kmp_debug_assert ("chunk && init % chunk == 0", "openmp/runtime/src/kmp_dispatch.cpp" , 1719); }; | |||
1720 | // compare with K*nproc*(chunk+1), K=2 by default | |||
1721 | if ((T)remaining < pr->u.p.parm2) { | |||
1722 | // use dynamic-style schedule | |||
1723 | // atomically increment iterations, get old value | |||
1724 | init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration)reinterpret_cast<volatile ST *>(&sh->u.s.iteration ), | |||
1725 | (ST)chunk); | |||
1726 | remaining = trip - init; | |||
1727 | if (remaining <= 0) { | |||
1728 | status = 0; // all iterations got by other threads | |||
1729 | } else { | |||
1730 | // got some iterations to work on | |||
1731 | status = 1; | |||
1732 | if ((T)remaining > chunk) { | |||
1733 | limit = init + chunk - 1; | |||
1734 | } else { | |||
1735 | last = true; // the last chunk | |||
1736 | limit = init + remaining - 1; | |||
1737 | } // if | |||
1738 | } // if | |||
1739 | break; | |||
1740 | } // if | |||
1741 | // divide by K*nproc | |||
1742 | UT span; | |||
1743 | __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3), | |||
1744 | &span); | |||
1745 | UT rem = span % chunk; | |||
1746 | if (rem) // adjust so that span%chunk == 0 | |||
1747 | span += chunk - rem; | |||
1748 | limit = init + span; | |||
1749 | if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration)reinterpret_cast<volatile ST *>(&sh->u.s.iteration ), | |||
1750 | (ST)init, (ST)limit)) { | |||
1751 | // CAS was successful, chunk obtained | |||
1752 | status = 1; | |||
1753 | --limit; | |||
1754 | break; | |||
1755 | } // if | |||
1756 | } // while | |||
1757 | if (status != 0) { | |||
1758 | start = pr->u.p.lb; | |||
1759 | incr = pr->u.p.st; | |||
1760 | if (p_st != NULL__null) | |||
1761 | *p_st = incr; | |||
1762 | *p_lb = start + init * incr; | |||
1763 | *p_ub = start + limit * incr; | |||
1764 | if (pr->flags.ordered) { | |||
1765 | pr->u.p.ordered_lower = init; | |||
1766 | pr->u.p.ordered_upper = limit; | |||
1767 | } // if | |||
1768 | } else { | |||
1769 | *p_lb = 0; | |||
1770 | *p_ub = 0; | |||
1771 | if (p_st != NULL__null) | |||
1772 | *p_st = 0; | |||
1773 | } // if | |||
1774 | } // case | |||
1775 | break; | |||
1776 | ||||
1777 | case kmp_sch_guided_analytical_chunked: { | |||
1778 | T chunkspec = pr->u.p.parm1; | |||
1779 | UT chunkIdx; | |||
1780 | #if KMP_USE_X87CONTROL0 | |||
1781 | /* for storing original FPCW value for Windows* OS on | |||
1782 | IA-32 architecture 8-byte version */ | |||
1783 | unsigned int oldFpcw; | |||
1784 | unsigned int fpcwSet = 0; | |||
1785 | #endif | |||
1786 | KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d " "kmp_sch_guided_analytical_chunked case\n", gtid); } | |||
1787 | "kmp_sch_guided_analytical_chunked case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d " "kmp_sch_guided_analytical_chunked case\n", gtid); } | |||
1788 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d " "kmp_sch_guided_analytical_chunked case\n", gtid); }; | |||
1789 | ||||
1790 | trip = pr->u.p.tc; | |||
1791 | ||||
1792 | KMP_DEBUG_ASSERT(nproc > 1)if (!(nproc > 1)) { __kmp_debug_assert("nproc > 1", "openmp/runtime/src/kmp_dispatch.cpp" , 1792); }; | |||
1793 | KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip)if (!((2UL * chunkspec + 1) * (UT)nproc < trip)) { __kmp_debug_assert ("(2UL * chunkspec + 1) * (UT)nproc < trip", "openmp/runtime/src/kmp_dispatch.cpp" , 1793); }; | |||
1794 | ||||
1795 | while (1) { /* this while loop is a safeguard against unexpected zero | |||
1796 | chunk sizes */ | |||
1797 | chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration); | |||
1798 | if (chunkIdx >= (UT)pr->u.p.parm2) { | |||
1799 | --trip; | |||
1800 | /* use dynamic-style scheduling */ | |||
1801 | init = chunkIdx * chunkspec + pr->u.p.count; | |||
1802 | /* need to verify init > 0 in case of overflow in the above | |||
1803 | * calculation */ | |||
1804 | if ((status = (init > 0 && init <= trip)) != 0) { | |||
1805 | limit = init + chunkspec - 1; | |||
1806 | ||||
1807 | if ((last = (limit >= trip)) != 0) | |||
1808 | limit = trip; | |||
1809 | } | |||
1810 | break; | |||
1811 | } else { | |||
1812 | /* use exponential-style scheduling */ | |||
1813 | /* The following check is to workaround the lack of long double precision on | |||
1814 | Windows* OS. | |||
1815 | This check works around the possible effect that init != 0 for chunkIdx == 0. | |||
1816 | */ | |||
1817 | #if KMP_USE_X87CONTROL0 | |||
1818 | /* If we haven't already done so, save original | |||
1819 | FPCW and set precision to 64-bit, as Windows* OS | |||
1820 | on IA-32 architecture defaults to 53-bit */ | |||
1821 | if (!fpcwSet) { | |||
1822 | oldFpcw = _control87(0, 0); | |||
1823 | _control87(_PC_64, _MCW_PC); | |||
1824 | fpcwSet = 0x30000; | |||
1825 | } | |||
1826 | #endif | |||
1827 | if (chunkIdx) { | |||
1828 | init = __kmp_dispatch_guided_remaining<T>( | |||
1829 | trip, *(DBL *)&pr->u.p.parm3, chunkIdx); | |||
1830 | KMP_DEBUG_ASSERT(init)if (!(init)) { __kmp_debug_assert("init", "openmp/runtime/src/kmp_dispatch.cpp" , 1830); }; | |||
1831 | init = trip - init; | |||
1832 | } else | |||
1833 | init = 0; | |||
1834 | limit = trip - __kmp_dispatch_guided_remaining<T>( | |||
1835 | trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1); | |||
1836 | KMP_ASSERT(init <= limit)if (!(init <= limit)) { __kmp_debug_assert("init <= limit" , "openmp/runtime/src/kmp_dispatch.cpp", 1836); }; | |||
1837 | if (init < limit) { | |||
1838 | KMP_DEBUG_ASSERT(limit <= trip)if (!(limit <= trip)) { __kmp_debug_assert("limit <= trip" , "openmp/runtime/src/kmp_dispatch.cpp", 1838); }; | |||
1839 | --limit; | |||
1840 | status = 1; | |||
1841 | break; | |||
1842 | } // if | |||
1843 | } // if | |||
1844 | } // while (1) | |||
1845 | #if KMP_USE_X87CONTROL0 | |||
1846 | /* restore FPCW if necessary | |||
1847 | AC: check fpcwSet flag first because oldFpcw can be uninitialized here | |||
1848 | */ | |||
1849 | if (fpcwSet && (oldFpcw & fpcwSet)) | |||
1850 | _control87(oldFpcw, _MCW_PC); | |||
1851 | #endif | |||
1852 | if (status != 0) { | |||
1853 | start = pr->u.p.lb; | |||
1854 | incr = pr->u.p.st; | |||
1855 | if (p_st != NULL__null) | |||
1856 | *p_st = incr; | |||
1857 | *p_lb = start + init * incr; | |||
1858 | *p_ub = start + limit * incr; | |||
1859 | if (pr->flags.ordered) { | |||
1860 | pr->u.p.ordered_lower = init; | |||
1861 | pr->u.p.ordered_upper = limit; | |||
1862 | } | |||
1863 | } else { | |||
1864 | *p_lb = 0; | |||
1865 | *p_ub = 0; | |||
1866 | if (p_st != NULL__null) | |||
1867 | *p_st = 0; | |||
1868 | } | |||
1869 | } // case | |||
1870 | break; | |||
1871 | ||||
1872 | case kmp_sch_trapezoidal: { | |||
1873 | UT index; | |||
1874 | T parm2 = pr->u.p.parm2; | |||
1875 | T parm3 = pr->u.p.parm3; | |||
1876 | T parm4 = pr->u.p.parm4; | |||
1877 | KD_TRACE(100,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n" , gtid); } | |||
1878 | ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n" , gtid); } | |||
1879 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n" , gtid); }; | |||
1880 | ||||
1881 | index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration); | |||
1882 | ||||
1883 | init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2; | |||
1884 | trip = pr->u.p.tc - 1; | |||
1885 | ||||
1886 | if ((status = ((T)index < parm3 && init <= trip)) == 0) { | |||
1887 | *p_lb = 0; | |||
1888 | *p_ub = 0; | |||
1889 | if (p_st != NULL__null) | |||
1890 | *p_st = 0; | |||
1891 | } else { | |||
1892 | start = pr->u.p.lb; | |||
1893 | limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1; | |||
1894 | incr = pr->u.p.st; | |||
1895 | ||||
1896 | if ((last = (limit >= trip)) != 0) | |||
1897 | limit = trip; | |||
1898 | ||||
1899 | if (p_st != NULL__null) | |||
1900 | *p_st = incr; | |||
1901 | ||||
1902 | if (incr == 1) { | |||
1903 | *p_lb = start + init; | |||
1904 | *p_ub = start + limit; | |||
1905 | } else { | |||
1906 | *p_lb = start + init * incr; | |||
1907 | *p_ub = start + limit * incr; | |||
1908 | } | |||
1909 | ||||
1910 | if (pr->flags.ordered) { | |||
1911 | pr->u.p.ordered_lower = init; | |||
1912 | pr->u.p.ordered_upper = limit; | |||
1913 | } // if | |||
1914 | } // if | |||
1915 | } // case | |||
1916 | break; | |||
1917 | default: { | |||
1918 | status = 0; // to avoid complaints on uninitialized variable use | |||
1919 | __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected)__kmp_msg_format(kmp_i18n_msg_UnknownSchedTypeDetected), // Primary message | |||
1920 | KMP_HNT(GetNewerLibrary)__kmp_msg_format(kmp_i18n_hnt_GetNewerLibrary), // Hint | |||
1921 | __kmp_msg_null // Variadic argument list terminator | |||
1922 | ); | |||
1923 | } break; | |||
1924 | } // switch | |||
1925 | if (p_last) | |||
1926 | *p_last = last; | |||
1927 | #ifdef KMP_DEBUG1 | |||
1928 | if (pr->flags.ordered) { | |||
1929 | char *buff; | |||
1930 | // create format specifiers before the debug output | |||
1931 | buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " | |||
1932 | "ordered_lower:%%%s ordered_upper:%%%s\n", | |||
1933 | traits_t<UT>::spec, traits_t<UT>::spec); | |||
1934 | KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper))if (kmp_d_debug >= 1000) { __kmp_debug_printf (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper); }; | |||
1935 | __kmp_str_free(&buff); | |||
1936 | } | |||
1937 | { | |||
1938 | char *buff; | |||
1939 | // create format specifiers before the debug output | |||
1940 | buff = __kmp_str_format( | |||
1941 | "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d " | |||
1942 | "p_lb:%%%s p_ub:%%%s p_st:%%%s\n", | |||
1943 | traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); | |||
1944 | KMP_DEBUG_ASSERT(p_last)if (!(p_last)) { __kmp_debug_assert("p_last", "openmp/runtime/src/kmp_dispatch.cpp" , 1944); }; | |||
1945 | KMP_DEBUG_ASSERT(p_st)if (!(p_st)) { __kmp_debug_assert("p_st", "openmp/runtime/src/kmp_dispatch.cpp" , 1945); }; | |||
1946 | KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, status , *p_last, *p_lb, *p_ub, *p_st); }; | |||
| ||||
1947 | __kmp_str_free(&buff); | |||
1948 | } | |||
1949 | #endif | |||
1950 | return status; | |||
1951 | } | |||
1952 | ||||
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
/* Notify an attached tool of the chunk (lb..ub by st) just handed to this
   thread; only fires when a dispatch callback is registered and a chunk was
   actually obtained (status != 0). */
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
  if (ompt_enabled.ompt_callback_dispatch && status) {                         \
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \
    ompt_dispatch_chunk_t chunk;                                               \
    ompt_data_t instance = ompt_data_none;                                     \
    OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \
    instance.ptr = &chunk;                                                     \
    ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \
        &(team_info->parallel_data), &(task_info->task_data),                  \
        ompt_dispatch_ws_loop_chunk, instance);                                \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
#endif

/* Accumulate loop statistics: count the iterations contained in the chunk
   just returned through *p_lb/*p_ub with stride pr->u.p.st. */
#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
2016 | ||||
2017 | template <typename T> | |||
2018 | static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, | |||
2019 | T *p_lb, T *p_ub, | |||
2020 | typename traits_t<T>::signed_t *p_st | |||
2021 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2022 | , | |||
2023 | void *codeptr | |||
2024 | #endif | |||
2025 | ) { | |||
2026 | ||||
2027 | typedef typename traits_t<T>::unsigned_t UT; | |||
2028 | typedef typename traits_t<T>::signed_t ST; | |||
2029 | // This is potentially slightly misleading, schedule(runtime) will appear here | |||
2030 | // even if the actual runtime schedule is static. (Which points out a | |||
2031 | // disadvantage of schedule(runtime): even when static scheduling is used it | |||
2032 | // costs more than a compile time choice to use static scheduling would.) | |||
2033 | KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling)((void)0); | |||
2034 | ||||
2035 | int status; | |||
2036 | dispatch_private_info_template<T> *pr; | |||
2037 | __kmp_assert_valid_gtid(gtid); | |||
2038 | kmp_info_t *th = __kmp_threads[gtid]; | |||
2039 | kmp_team_t *team = th->th.th_team; | |||
2040 | ||||
2041 | KMP_DEBUG_ASSERT(p_lb && p_ub && p_st)if (!(p_lb && p_ub && p_st)) { __kmp_debug_assert ("p_lb && p_ub && p_st", "openmp/runtime/src/kmp_dispatch.cpp" , 2041); }; // AC: these cannot be NULL | |||
2042 | KD_TRACE(if (kmp_d_debug >= 1000) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n" , gtid, p_lb, p_ub, p_st, p_last); } | |||
2043 | 1000,if (kmp_d_debug >= 1000) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n" , gtid, p_lb, p_ub, p_st, p_last); } | |||
2044 | ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",if (kmp_d_debug >= 1000) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n" , gtid, p_lb, p_ub, p_st, p_last); } | |||
2045 | gtid, p_lb, p_ub, p_st, p_last))if (kmp_d_debug >= 1000) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n" , gtid, p_lb, p_ub, p_st, p_last); }; | |||
2046 | ||||
2047 | if (team->t.t_serialized) { | |||
2048 | /* NOTE: serialize this dispatch because we are not at the active level */ | |||
2049 | pr = reinterpret_cast<dispatch_private_info_template<T> *>( | |||
2050 | th->th.th_dispatch->th_disp_buffer); /* top of the stack */ | |||
2051 | KMP_DEBUG_ASSERT(pr)if (!(pr)) { __kmp_debug_assert("pr", "openmp/runtime/src/kmp_dispatch.cpp" , 2051); }; | |||
2052 | ||||
2053 | if ((status = (pr->u.p.tc != 0)) == 0) { | |||
2054 | *p_lb = 0; | |||
2055 | *p_ub = 0; | |||
2056 | // if ( p_last != NULL ) | |||
2057 | // *p_last = 0; | |||
2058 | if (p_st != NULL__null) | |||
2059 | *p_st = 0; | |||
2060 | if (__kmp_env_consistency_check) { | |||
2061 | if (pr->pushed_ws != ct_none) { | |||
2062 | pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); | |||
2063 | } | |||
2064 | } | |||
2065 | } else if (pr->flags.nomerge) { | |||
2066 | kmp_int32 last; | |||
2067 | T start; | |||
2068 | UT limit, trip, init; | |||
2069 | ST incr; | |||
2070 | T chunk = pr->u.p.parm1; | |||
2071 | ||||
2072 | KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n" , gtid); } | |||
2073 | gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n" , gtid); }; | |||
2074 | ||||
2075 | init = chunk * pr->u.p.count++; | |||
2076 | trip = pr->u.p.tc - 1; | |||
2077 | ||||
2078 | if ((status = (init <= trip)) == 0) { | |||
2079 | *p_lb = 0; | |||
2080 | *p_ub = 0; | |||
2081 | // if ( p_last != NULL ) | |||
2082 | // *p_last = 0; | |||
2083 | if (p_st != NULL__null) | |||
2084 | *p_st = 0; | |||
2085 | if (__kmp_env_consistency_check) { | |||
2086 | if (pr->pushed_ws != ct_none) { | |||
2087 | pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); | |||
2088 | } | |||
2089 | } | |||
2090 | } else { | |||
2091 | start = pr->u.p.lb; | |||
2092 | limit = chunk + init - 1; | |||
2093 | incr = pr->u.p.st; | |||
2094 | ||||
2095 | if ((last = (limit >= trip)) != 0) { | |||
2096 | limit = trip; | |||
2097 | #if KMP_OS_WINDOWS0 | |||
2098 | pr->u.p.last_upper = pr->u.p.ub; | |||
2099 | #endif /* KMP_OS_WINDOWS */ | |||
2100 | } | |||
2101 | if (p_last != NULL__null) | |||
2102 | *p_last = last; | |||
2103 | if (p_st != NULL__null) | |||
2104 | *p_st = incr; | |||
2105 | if (incr == 1) { | |||
2106 | *p_lb = start + init; | |||
2107 | *p_ub = start + limit; | |||
2108 | } else { | |||
2109 | *p_lb = start + init * incr; | |||
2110 | *p_ub = start + limit * incr; | |||
2111 | } | |||
2112 | ||||
2113 | if (pr->flags.ordered) { | |||
2114 | pr->u.p.ordered_lower = init; | |||
2115 | pr->u.p.ordered_upper = limit; | |||
2116 | #ifdef KMP_DEBUG1 | |||
2117 | { | |||
2118 | char *buff; | |||
2119 | // create format specifiers before the debug output | |||
2120 | buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " | |||
2121 | "ordered_lower:%%%s ordered_upper:%%%s\n", | |||
2122 | traits_t<UT>::spec, traits_t<UT>::spec); | |||
2123 | KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,if (kmp_d_debug >= 1000) { __kmp_debug_printf (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper); } | |||
2124 | pr->u.p.ordered_upper))if (kmp_d_debug >= 1000) { __kmp_debug_printf (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper); }; | |||
2125 | __kmp_str_free(&buff); | |||
2126 | } | |||
2127 | #endif | |||
2128 | } // if | |||
2129 | } // if | |||
2130 | } else { | |||
2131 | pr->u.p.tc = 0; | |||
2132 | *p_lb = pr->u.p.lb; | |||
2133 | *p_ub = pr->u.p.ub; | |||
2134 | #if KMP_OS_WINDOWS0 | |||
2135 | pr->u.p.last_upper = *p_ub; | |||
2136 | #endif /* KMP_OS_WINDOWS */ | |||
2137 | if (p_last != NULL__null) | |||
2138 | *p_last = TRUE(!0); | |||
2139 | if (p_st != NULL__null) | |||
2140 | *p_st = pr->u.p.st; | |||
2141 | } // if | |||
2142 | #ifdef KMP_DEBUG1 | |||
2143 | { | |||
2144 | char *buff; | |||
2145 | // create format specifiers before the debug output | |||
2146 | buff = __kmp_str_format( | |||
2147 | "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " | |||
2148 | "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", | |||
2149 | traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); | |||
2150 | KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, * p_lb, *p_ub, *p_st, p_last, (p_last ? *p_last : 0), status); } | |||
2151 | (p_last ? *p_last : 0), status))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, * p_lb, *p_ub, *p_st, p_last, (p_last ? *p_last : 0), status); }; | |||
2152 | __kmp_str_free(&buff); | |||
2153 | } | |||
2154 | #endif | |||
2155 | #if INCLUDE_SSC_MARKS(1 && 1) | |||
2156 | SSC_MARK_DISPATCH_NEXT()__asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(0xd697) : "%ebx"); | |||
2157 | #endif | |||
2158 | OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status)if (ompt_enabled.ompt_callback_dispatch && status) { ompt_team_info_t *team_info = __ompt_get_teaminfo(0, __null); ompt_task_info_t *task_info = __ompt_get_task_info_object(0); ompt_dispatch_chunk_t chunk; ompt_data_t instance = {0}; do { if (pr->u.p.st > 0) { chunk.start = static_cast<uint64_t>(*p_lb); chunk .iterations = static_cast<uint64_t>(((*p_ub) - (*p_lb)) / (pr->u.p.st) + 1); } else { chunk.start = static_cast< uint64_t>(*p_ub); chunk.iterations = static_cast<uint64_t >(((*p_lb) - (*p_ub)) / -(pr->u.p.st) + 1); } } while ( 0); instance.ptr = &chunk; ompt_callbacks.ompt_callback_dispatch_callback ( &(team_info->parallel_data), &(task_info->task_data ), ompt_dispatch_ws_loop_chunk, instance); }; | |||
2159 | OMPT_LOOP_ENDif (status == 0) { if (ompt_enabled.ompt_callback_work) { ompt_team_info_t *team_info = __ompt_get_teaminfo(0, __null); ompt_task_info_t *task_info = __ompt_get_task_info_object(0); ompt_callbacks. ompt_callback_work_callback( ompt_work_loop, ompt_scope_end, & (team_info->parallel_data), &(task_info->task_data) , 0, codeptr); } }; | |||
2160 | KMP_STATS_LOOP_END; | |||
2161 | return status; | |||
2162 | } else { | |||
2163 | kmp_int32 last = 0; | |||
2164 | dispatch_shared_info_template<T> volatile *sh; | |||
2165 | ||||
2166 | KMP_DEBUG_ASSERT(th->th.th_dispatch ==if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 2167); } | |||
2167 | &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid])if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 2167); }; | |||
2168 | ||||
2169 | pr = reinterpret_cast<dispatch_private_info_template<T> *>( | |||
2170 | th->th.th_dispatch->th_dispatch_pr_current); | |||
2171 | KMP_DEBUG_ASSERT(pr)if (!(pr)) { __kmp_debug_assert("pr", "openmp/runtime/src/kmp_dispatch.cpp" , 2171); }; | |||
2172 | sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( | |||
2173 | th->th.th_dispatch->th_dispatch_sh_current); | |||
2174 | KMP_DEBUG_ASSERT(sh)if (!(sh)) { __kmp_debug_assert("sh", "openmp/runtime/src/kmp_dispatch.cpp" , 2174); }; | |||
2175 | ||||
2176 | #if KMP_USE_HIER_SCHED0 | |||
2177 | if (pr->flags.use_hier) | |||
2178 | status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st); | |||
2179 | else | |||
2180 | #endif // KMP_USE_HIER_SCHED | |||
2181 | status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub, | |||
2182 | p_st, th->th.th_team_nproc, | |||
2183 | th->th.th_info.ds.ds_tid); | |||
2184 | // status == 0: no more iterations to execute | |||
2185 | if (status == 0) { | |||
2186 | ST num_done; | |||
2187 | num_done = test_then_inc<ST>(&sh->u.s.num_done); | |||
2188 | #ifdef KMP_DEBUG1 | |||
2189 | { | |||
2190 | char *buff; | |||
2191 | // create format specifiers before the debug output | |||
2192 | buff = __kmp_str_format( | |||
2193 | "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", | |||
2194 | traits_t<ST>::spec); | |||
2195 | KD_TRACE(10, (buff, gtid, sh->u.s.num_done))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, sh ->u.s.num_done); }; | |||
2196 | __kmp_str_free(&buff); | |||
2197 | } | |||
2198 | #endif | |||
2199 | ||||
2200 | #if KMP_USE_HIER_SCHED0 | |||
2201 | pr->flags.use_hier = FALSE0; | |||
2202 | #endif | |||
2203 | if (num_done == th->th.th_team_nproc - 1) { | |||
2204 | #if KMP_STATIC_STEAL_ENABLED1 | |||
2205 | if (pr->schedule == kmp_sch_static_steal) { | |||
2206 | int i; | |||
2207 | int idx = (th->th.th_dispatch->th_disp_index - 1) % | |||
2208 | __kmp_dispatch_num_buffers; // current loop index | |||
2209 | // loop complete, safe to destroy locks used for stealing | |||
2210 | for (i = 0; i < th->th.th_team_nproc; ++i) { | |||
2211 | dispatch_private_info_template<T> *buf = | |||
2212 | reinterpret_cast<dispatch_private_info_template<T> *>( | |||
2213 | &team->t.t_dispatch[i].th_disp_buffer[idx]); | |||
2214 | KMP_ASSERT(buf->steal_flag == THIEF)if (!(buf->steal_flag == THIEF)) { __kmp_debug_assert("buf->steal_flag == THIEF" , "openmp/runtime/src/kmp_dispatch.cpp", 2214); }; // buffer must be inactive | |||
2215 | KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED)(&buf->steal_flag)->store(UNUSED, std::memory_order_relaxed ); | |||
2216 | if (traits_t<T>::type_size > 4) { | |||
2217 | // destroy locks used for stealing | |||
2218 | kmp_lock_t *lck = buf->u.p.steal_lock; | |||
2219 | KMP_ASSERT(lck != NULL)if (!(lck != __null)) { __kmp_debug_assert("lck != NULL", "openmp/runtime/src/kmp_dispatch.cpp" , 2219); }; | |||
2220 | __kmp_destroy_lock(lck); | |||
2221 | __kmp_free(lck)___kmp_free((lck), "openmp/runtime/src/kmp_dispatch.cpp", 2221 ); | |||
2222 | buf->u.p.steal_lock = NULL__null; | |||
2223 | } | |||
2224 | } | |||
2225 | } | |||
2226 | #endif | |||
2227 | /* NOTE: release shared buffer to be reused */ | |||
2228 | ||||
2229 | KMP_MB(); /* Flush all pending memory write invalidates. */ | |||
2230 | ||||
2231 | sh->u.s.num_done = 0; | |||
2232 | sh->u.s.iteration = 0; | |||
2233 | ||||
2234 | /* TODO replace with general release procedure? */ | |||
2235 | if (pr->flags.ordered) { | |||
2236 | sh->u.s.ordered_iteration = 0; | |||
2237 | } | |||
2238 | ||||
2239 | sh->buffer_index += __kmp_dispatch_num_buffers; | |||
2240 | KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d change buffer_index:%d\n" , gtid, sh->buffer_index); } | |||
2241 | gtid, sh->buffer_index))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d change buffer_index:%d\n" , gtid, sh->buffer_index); }; | |||
2242 | ||||
2243 | KMP_MB(); /* Flush all pending memory write invalidates. */ | |||
2244 | ||||
2245 | } // if | |||
2246 | if (__kmp_env_consistency_check) { | |||
2247 | if (pr->pushed_ws != ct_none) { | |||
2248 | pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc); | |||
2249 | } | |||
2250 | } | |||
2251 | ||||
2252 | th->th.th_dispatch->th_deo_fcn = NULL__null; | |||
2253 | th->th.th_dispatch->th_dxo_fcn = NULL__null; | |||
2254 | th->th.th_dispatch->th_dispatch_sh_current = NULL__null; | |||
2255 | th->th.th_dispatch->th_dispatch_pr_current = NULL__null; | |||
2256 | } // if (status == 0) | |||
2257 | #if KMP_OS_WINDOWS0 | |||
2258 | else if (last) { | |||
2259 | pr->u.p.last_upper = pr->u.p.ub; | |||
2260 | } | |||
2261 | #endif /* KMP_OS_WINDOWS */ | |||
2262 | if (p_last != NULL__null && status != 0) | |||
2263 | *p_last = last; | |||
2264 | } // if | |||
2265 | ||||
2266 | #ifdef KMP_DEBUG1 | |||
2267 | { | |||
2268 | char *buff; | |||
2269 | // create format specifiers before the debug output | |||
2270 | buff = __kmp_str_format( | |||
2271 | "__kmp_dispatch_next: T#%%d normal case: " | |||
2272 | "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n", | |||
2273 | traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec); | |||
2274 | KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, * p_lb, *p_ub, p_st ? *p_st : 0, p_last, (p_last ? *p_last : 0) , status); } | |||
2275 | (p_last ? *p_last : 0), status))if (kmp_d_debug >= 10) { __kmp_debug_printf (buff, gtid, * p_lb, *p_ub, p_st ? *p_st : 0, p_last, (p_last ? *p_last : 0) , status); }; | |||
2276 | __kmp_str_free(&buff); | |||
2277 | } | |||
2278 | #endif | |||
2279 | #if INCLUDE_SSC_MARKS(1 && 1) | |||
2280 | SSC_MARK_DISPATCH_NEXT()__asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(0xd697) : "%ebx"); | |||
2281 | #endif | |||
2282 | OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status)if (ompt_enabled.ompt_callback_dispatch && status) { ompt_team_info_t *team_info = __ompt_get_teaminfo(0, __null); ompt_task_info_t *task_info = __ompt_get_task_info_object(0); ompt_dispatch_chunk_t chunk; ompt_data_t instance = {0}; do { if (pr->u.p.st > 0) { chunk.start = static_cast<uint64_t>(*p_lb); chunk .iterations = static_cast<uint64_t>(((*p_ub) - (*p_lb)) / (pr->u.p.st) + 1); } else { chunk.start = static_cast< uint64_t>(*p_ub); chunk.iterations = static_cast<uint64_t >(((*p_lb) - (*p_ub)) / -(pr->u.p.st) + 1); } } while ( 0); instance.ptr = &chunk; ompt_callbacks.ompt_callback_dispatch_callback ( &(team_info->parallel_data), &(task_info->task_data ), ompt_dispatch_ws_loop_chunk, instance); }; | |||
2283 | OMPT_LOOP_ENDif (status == 0) { if (ompt_enabled.ompt_callback_work) { ompt_team_info_t *team_info = __ompt_get_teaminfo(0, __null); ompt_task_info_t *task_info = __ompt_get_task_info_object(0); ompt_callbacks. ompt_callback_work_callback( ompt_work_loop, ompt_scope_end, & (team_info->parallel_data), &(task_info->task_data) , 0, codeptr); } }; | |||
2284 | KMP_STATS_LOOP_END; | |||
2285 | return status; | |||
2286 | } | |||
2287 | ||||
2288 | /*! | |||
2289 | @ingroup WORK_SHARING | |||
2290 | @param loc source location information | |||
2291 | @param global_tid global thread number | |||
2292 | @return Zero if the parallel region is not active and this thread should execute | |||
2293 | all sections, non-zero otherwise. | |||
2294 | ||||
2295 | Beginning of sections construct. | |||
2296 | There are no implicit barriers in the "sections" calls, rather the compiler | |||
2297 | should introduce an explicit barrier if it is required. | |||
2298 | ||||
2299 | This implementation is based on __kmp_dispatch_init, using the same constructs | |||
2300 | for shared data (we can't have sections nested directly in an omp for loop; | |||
2301 | there should be a parallel region in between). | |||
2302 | */ | |||
2303 | kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) { | |||
2304 | ||||
2305 | int active; | |||
2306 | kmp_info_t *th; | |||
2307 | kmp_team_t *team; | |||
2308 | kmp_uint32 my_buffer_index; | |||
2309 | dispatch_shared_info_template<kmp_int32> volatile *sh; | |||
2310 | ||||
2311 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2311); }; | |||
2312 | ||||
2313 | if (!TCR_4(__kmp_init_parallel)(__kmp_init_parallel)) | |||
2314 | __kmp_parallel_initialize(); | |||
2315 | __kmp_resume_if_soft_paused(); | |||
2316 | ||||
2317 | /* setup data */ | |||
2318 | th = __kmp_threads[gtid]; | |||
2319 | team = th->th.th_team; | |||
2320 | active = !team->t.t_serialized; | |||
2321 | th->th.th_ident = loc; | |||
2322 | ||||
2323 | KMP_COUNT_BLOCK(OMP_SECTIONS)((void)0); | |||
2324 | KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid))if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmpc_sections: called by T#%d\n" , gtid); }; | |||
2325 | ||||
2326 | if (active) { | |||
2327 | // Setup sections in the same way as dynamic scheduled loops. | |||
2328 | // We need one shared data: which section is to execute next. | |||
2329 | // (in case parallel is not active, all sections will be executed on the | |||
2330 | // same thread) | |||
2331 | KMP_DEBUG_ASSERT(th->th.th_dispatch ==if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 2332); } | |||
2332 | &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid])if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 2332); }; | |||
2333 | ||||
2334 | my_buffer_index = th->th.th_dispatch->th_disp_index++; | |||
2335 | ||||
2336 | // reuse shared data structures from dynamic sched loops: | |||
2337 | sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( | |||
2338 | &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); | |||
2339 | KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmpc_sections_init: T#%d my_buffer_index:%d\n" , gtid, my_buffer_index); } | |||
2340 | my_buffer_index))if (kmp_d_debug >= 10) { __kmp_debug_printf ("__kmpc_sections_init: T#%d my_buffer_index:%d\n" , gtid, my_buffer_index); }; | |||
2341 | ||||
2342 | th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error; | |||
2343 | th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error; | |||
2344 | ||||
2345 | KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d " "sh->buffer_index:%d\n", gtid, my_buffer_index, sh->buffer_index ); } | |||
2346 | "sh->buffer_index:%d\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d " "sh->buffer_index:%d\n", gtid, my_buffer_index, sh->buffer_index ); } | |||
2347 | gtid, my_buffer_index, sh->buffer_index))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d " "sh->buffer_index:%d\n", gtid, my_buffer_index, sh->buffer_index ); }; | |||
2348 | __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index, | |||
2349 | __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL), __null); | |||
2350 | // Note: KMP_WAIT() cannot be used there: buffer index and | |||
2351 | // my_buffer_index are *always* 32-bit integers. | |||
2352 | KMP_MB(); | |||
2353 | KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d " "sh->buffer_index:%d\n", gtid, my_buffer_index, sh->buffer_index ); } | |||
2354 | "sh->buffer_index:%d\n",if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d " "sh->buffer_index:%d\n", gtid, my_buffer_index, sh->buffer_index ); } | |||
2355 | gtid, my_buffer_index, sh->buffer_index))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d " "sh->buffer_index:%d\n", gtid, my_buffer_index, sh->buffer_index ); }; | |||
2356 | ||||
2357 | th->th.th_dispatch->th_dispatch_pr_current = | |||
2358 | nullptr; // sections construct doesn't need private data | |||
2359 | th->th.th_dispatch->th_dispatch_sh_current = | |||
2360 | CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh)const_cast<dispatch_shared_info_t *>((volatile dispatch_shared_info_t *)sh); | |||
2361 | } | |||
2362 | ||||
2363 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2364 | if (ompt_enabled.ompt_callback_work) { | |||
2365 | ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL__null); | |||
2366 | ompt_task_info_t *task_info = __ompt_get_task_info_object(0); | |||
2367 | ompt_callbacks.ompt_callback(ompt_callback_work)ompt_callback_work_callback( | |||
2368 | ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data), | |||
2369 | &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)__builtin_return_address(0)); | |||
2370 | } | |||
2371 | #endif | |||
2372 | KMP_PUSH_PARTITIONED_TIMER(OMP_sections)((void)0); | |||
2373 | ||||
2374 | return active; | |||
2375 | } | |||
2376 | ||||
2377 | /*! | |||
2378 | @ingroup WORK_SHARING | |||
2379 | @param loc source location information | |||
2380 | @param global_tid global thread number | |||
2381 | @param numberOfSections number of sections in the 'sections' construct | |||
2382 | @return unsigned [from 0 to n) - number (id) of the section to execute next on | |||
2383 | this thread. n (or any other number not in range) - nothing to execute on this | |||
2384 | thread | |||
2385 | */ | |||
2386 | ||||
2387 | kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid, | |||
2388 | kmp_int32 numberOfSections) { | |||
2389 | ||||
2390 | KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead)((void)0); | |||
2391 | ||||
2392 | kmp_info_t *th = __kmp_threads[gtid]; | |||
2393 | #ifdef KMP_DEBUG1 | |||
2394 | kmp_team_t *team = th->th.th_team; | |||
2395 | #endif | |||
2396 | ||||
2397 | KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,if (kmp_d_debug >= 1000) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d; number of sections:%d\n" , gtid, numberOfSections); } | |||
2398 | numberOfSections))if (kmp_d_debug >= 1000) { __kmp_debug_printf ("__kmp_dispatch_next: T#%d; number of sections:%d\n" , gtid, numberOfSections); }; | |||
2399 | ||||
2400 | // For serialized case we should not call this function: | |||
2401 | KMP_DEBUG_ASSERT(!team->t.t_serialized)if (!(!team->t.t_serialized)) { __kmp_debug_assert("!team->t.t_serialized" , "openmp/runtime/src/kmp_dispatch.cpp", 2401); }; | |||
2402 | ||||
2403 | dispatch_shared_info_template<kmp_int32> volatile *sh; | |||
2404 | ||||
2405 | KMP_DEBUG_ASSERT(th->th.th_dispatch ==if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 2406); } | |||
2406 | &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid])if (!(th->th.th_dispatch == &th->th.th_team->t.t_dispatch [th->th.th_info.ds.ds_tid])) { __kmp_debug_assert("th->th.th_dispatch == &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]" , "openmp/runtime/src/kmp_dispatch.cpp", 2406); }; | |||
2407 | ||||
2408 | KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current))if (!(!(th->th.th_dispatch->th_dispatch_pr_current))) { __kmp_debug_assert("!(th->th.th_dispatch->th_dispatch_pr_current)" , "openmp/runtime/src/kmp_dispatch.cpp", 2408); }; | |||
2409 | sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>( | |||
2410 | th->th.th_dispatch->th_dispatch_sh_current); | |||
2411 | KMP_DEBUG_ASSERT(sh)if (!(sh)) { __kmp_debug_assert("sh", "openmp/runtime/src/kmp_dispatch.cpp" , 2411); }; | |||
2412 | ||||
2413 | kmp_int32 sectionIndex = 0; | |||
2414 | bool moreSectionsToExecute = true; | |||
2415 | ||||
2416 | // Find section to execute: | |||
2417 | sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration); | |||
2418 | if (sectionIndex >= numberOfSections) { | |||
2419 | moreSectionsToExecute = false; | |||
2420 | } | |||
2421 | ||||
2422 | // status == 0: no more sections to execute; | |||
2423 | // OMPTODO: __kmpc_end_sections could be bypassed? | |||
2424 | if (!moreSectionsToExecute) { | |||
2425 | kmp_int32 num_done; | |||
2426 | ||||
2427 | num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done)); | |||
2428 | ||||
2429 | if (num_done == th->th.th_team_nproc - 1) { | |||
2430 | /* NOTE: release this buffer to be reused */ | |||
2431 | ||||
2432 | KMP_MB(); /* Flush all pending memory write invalidates. */ | |||
2433 | ||||
2434 | sh->u.s.num_done = 0; | |||
2435 | sh->u.s.iteration = 0; | |||
2436 | ||||
2437 | KMP_MB(); /* Flush all pending memory write invalidates. */ | |||
2438 | ||||
2439 | sh->buffer_index += __kmp_dispatch_num_buffers; | |||
2440 | KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_next_section: T#%d change buffer_index:%d\n" , gtid, sh->buffer_index); } | |||
2441 | sh->buffer_index))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_next_section: T#%d change buffer_index:%d\n" , gtid, sh->buffer_index); }; | |||
2442 | ||||
2443 | KMP_MB(); /* Flush all pending memory write invalidates. */ | |||
2444 | ||||
2445 | } // if | |||
2446 | ||||
2447 | th->th.th_dispatch->th_deo_fcn = NULL__null; | |||
2448 | th->th.th_dispatch->th_dxo_fcn = NULL__null; | |||
2449 | th->th.th_dispatch->th_dispatch_sh_current = NULL__null; | |||
2450 | th->th.th_dispatch->th_dispatch_pr_current = NULL__null; | |||
2451 | ||||
2452 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2453 | if (ompt_enabled.ompt_callback_dispatch) { | |||
2454 | ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL__null); | |||
2455 | ompt_task_info_t *task_info = __ompt_get_task_info_object(0); | |||
2456 | ompt_data_t instance = ompt_data_none{0}; | |||
2457 | instance.ptr = OMPT_GET_RETURN_ADDRESS(0)__builtin_return_address(0); | |||
2458 | ompt_callbacks.ompt_callback(ompt_callback_dispatch)ompt_callback_dispatch_callback( | |||
2459 | &(team_info->parallel_data), &(task_info->task_data), | |||
2460 | ompt_dispatch_section, instance); | |||
2461 | } | |||
2462 | #endif | |||
2463 | } | |||
2464 | ||||
2465 | return sectionIndex; | |||
2466 | } | |||
2467 | ||||
2468 | /*! | |||
2469 | @ingroup WORK_SHARING | |||
2470 | @param loc source location information | |||
2471 | @param global_tid global thread number | |||
2472 | ||||
2473 | End of "sections" construct. | |||
2474 | Don't need to wait here: barrier is added separately when needed. | |||
2475 | */ | |||
2476 | void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) { | |||
2477 | ||||
2478 | kmp_info_t *th = __kmp_threads[gtid]; | |||
2479 | int active = !th->th.th_team->t.t_serialized; | |||
2480 | ||||
2481 | KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_end_sections: T#%d called\n" , gtid); }; | |||
2482 | ||||
2483 | if (!active) { | |||
2484 | // In active case call finalization is done in __kmpc_next_section | |||
2485 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2486 | if (ompt_enabled.ompt_callback_work) { | |||
2487 | ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL__null); | |||
2488 | ompt_task_info_t *task_info = __ompt_get_task_info_object(0); | |||
2489 | ompt_callbacks.ompt_callback(ompt_callback_work)ompt_callback_work_callback( | |||
2490 | ompt_work_sections, ompt_scope_end, &(team_info->parallel_data), | |||
2491 | &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0)__builtin_return_address(0)); | |||
2492 | } | |||
2493 | #endif | |||
2494 | } | |||
2495 | ||||
2496 | KMP_POP_PARTITIONED_TIMER()((void)0); | |||
2497 | KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid))if (kmp_d_debug >= 100) { __kmp_debug_printf ("__kmpc_end_sections: T#%d returned\n" , gtid); }; | |||
2498 | } | |||
2499 | ||||
2500 | template <typename T> | |||
2501 | static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, | |||
2502 | kmp_int32 *plastiter, T *plower, T *pupper, | |||
2503 | typename traits_t<T>::signed_t incr) { | |||
2504 | typedef typename traits_t<T>::unsigned_t UT; | |||
2505 | kmp_uint32 team_id; | |||
2506 | kmp_uint32 nteams; | |||
2507 | UT trip_count; | |||
2508 | kmp_team_t *team; | |||
2509 | kmp_info_t *th; | |||
2510 | ||||
2511 | KMP_DEBUG_ASSERT(plastiter && plower && pupper)if (!(plastiter && plower && pupper)) { __kmp_debug_assert ("plastiter && plower && pupper", "openmp/runtime/src/kmp_dispatch.cpp" , 2511); }; | |||
2512 | KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid))if (kmp_e_debug >= 10) { __kmp_debug_printf ("__kmpc_dist_get_bounds called (%d)\n" , gtid); }; | |||
2513 | #ifdef KMP_DEBUG1 | |||
2514 | typedef typename traits_t<T>::signed_t ST; | |||
2515 | { | |||
2516 | char *buff; | |||
2517 | // create format specifiers before the debug output | |||
2518 | buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d " | |||
2519 | "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", | |||
2520 | traits_t<T>::spec, traits_t<T>::spec, | |||
2521 | traits_t<ST>::spec, traits_t<T>::spec); | |||
2522 | KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr))if (kmp_d_debug >= 100) { __kmp_debug_printf (buff, gtid, * plastiter, *plower, *pupper, incr); }; | |||
2523 | __kmp_str_free(&buff); | |||
2524 | } | |||
2525 | #endif | |||
2526 | ||||
2527 | if (__kmp_env_consistency_check) { | |||
2528 | if (incr == 0) { | |||
2529 | __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, | |||
2530 | loc); | |||
2531 | } | |||
2532 | if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) { | |||
2533 | // The loop is illegal. | |||
2534 | // Some zero-trip loops maintained by compiler, e.g.: | |||
2535 | // for(i=10;i<0;++i) // lower >= upper - run-time check | |||
2536 | // for(i=0;i>10;--i) // lower <= upper - run-time check | |||
2537 | // for(i=0;i>10;++i) // incr > 0 - compile-time check | |||
2538 | // for(i=10;i<0;--i) // incr < 0 - compile-time check | |||
2539 | // Compiler does not check the following illegal loops: | |||
2540 | // for(i=0;i<10;i+=incr) // where incr<0 | |||
2541 | // for(i=10;i>0;i-=incr) // where incr<0 | |||
2542 | __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc); | |||
2543 | } | |||
2544 | } | |||
2545 | __kmp_assert_valid_gtid(gtid); | |||
2546 | th = __kmp_threads[gtid]; | |||
2547 | team = th->th.th_team; | |||
2548 | KMP_DEBUG_ASSERT(th->th.th_teams_microtask)if (!(th->th.th_teams_microtask)) { __kmp_debug_assert("th->th.th_teams_microtask" , "openmp/runtime/src/kmp_dispatch.cpp", 2548); }; // we are in the teams construct | |||
2549 | nteams = th->th.th_teams_size.nteams; | |||
2550 | team_id = team->t.t_master_tid; | |||
2551 | KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc)if (!(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc )) { __kmp_debug_assert("nteams == (kmp_uint32)team->t.t_parent->t.t_nproc" , "openmp/runtime/src/kmp_dispatch.cpp", 2551); }; | |||
2552 | ||||
2553 | // compute global trip count | |||
2554 | if (incr == 1) { | |||
2555 | trip_count = *pupper - *plower + 1; | |||
2556 | } else if (incr == -1) { | |||
2557 | trip_count = *plower - *pupper + 1; | |||
2558 | } else if (incr > 0) { | |||
2559 | // upper-lower can exceed the limit of signed type | |||
2560 | trip_count = (UT)(*pupper - *plower) / incr + 1; | |||
2561 | } else { | |||
2562 | trip_count = (UT)(*plower - *pupper) / (-incr) + 1; | |||
2563 | } | |||
2564 | ||||
2565 | if (trip_count <= nteams) { | |||
2566 | KMP_DEBUG_ASSERT(if (!(__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced)) { __kmp_debug_assert("__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced" , "openmp/runtime/src/kmp_dispatch.cpp", 2569); } | |||
2567 | __kmp_static == kmp_sch_static_greedy ||if (!(__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced)) { __kmp_debug_assert("__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced" , "openmp/runtime/src/kmp_dispatch.cpp", 2569); } | |||
2568 | __kmp_static ==if (!(__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced)) { __kmp_debug_assert("__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced" , "openmp/runtime/src/kmp_dispatch.cpp", 2569); } | |||
2569 | kmp_sch_static_balanced)if (!(__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced)) { __kmp_debug_assert("__kmp_static == kmp_sch_static_greedy || __kmp_static == kmp_sch_static_balanced" , "openmp/runtime/src/kmp_dispatch.cpp", 2569); }; // Unknown static scheduling type. | |||
2570 | // only some teams get single iteration, others get nothing | |||
2571 | if (team_id < trip_count) { | |||
2572 | *pupper = *plower = *plower + team_id * incr; | |||
2573 | } else { | |||
2574 | *plower = *pupper + incr; // zero-trip loop | |||
2575 | } | |||
2576 | if (plastiter != NULL__null) | |||
2577 | *plastiter = (team_id == trip_count - 1); | |||
2578 | } else { | |||
2579 | if (__kmp_static == kmp_sch_static_balanced) { | |||
2580 | UT chunk = trip_count / nteams; | |||
2581 | UT extras = trip_count % nteams; | |||
2582 | *plower += | |||
2583 | incr * (team_id * chunk + (team_id < extras ? team_id : extras)); | |||
2584 | *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr); | |||
2585 | if (plastiter != NULL__null) | |||
2586 | *plastiter = (team_id == nteams - 1); | |||
2587 | } else { | |||
2588 | T chunk_inc_count = | |||
2589 | (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr; | |||
2590 | T upper = *pupper; | |||
2591 | KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy)if (!(__kmp_static == kmp_sch_static_greedy)) { __kmp_debug_assert ("__kmp_static == kmp_sch_static_greedy", "openmp/runtime/src/kmp_dispatch.cpp" , 2591); }; | |||
2592 | // Unknown static scheduling type. | |||
2593 | *plower += team_id * chunk_inc_count; | |||
2594 | *pupper = *plower + chunk_inc_count - incr; | |||
2595 | // Check/correct bounds if needed | |||
2596 | if (incr > 0) { | |||
2597 | if (*pupper < *plower) | |||
2598 | *pupper = traits_t<T>::max_value; | |||
2599 | if (plastiter != NULL__null) | |||
2600 | *plastiter = *plower <= upper && *pupper > upper - incr; | |||
2601 | if (*pupper > upper) | |||
2602 | *pupper = upper; // tracker C73258 | |||
2603 | } else { | |||
2604 | if (*pupper > *plower) | |||
2605 | *pupper = traits_t<T>::min_value; | |||
2606 | if (plastiter != NULL__null) | |||
2607 | *plastiter = *plower >= upper && *pupper < upper - incr; | |||
2608 | if (*pupper < upper) | |||
2609 | *pupper = upper; // tracker C73258 | |||
2610 | } | |||
2611 | } | |||
2612 | } | |||
2613 | } | |||
2614 | ||||
2615 | //----------------------------------------------------------------------------- | |||
2616 | // Dispatch routines | |||
2617 | // Transfer call to template< type T > | |||
2618 | // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule, | |||
2619 | // T lb, T ub, ST st, ST chunk ) | |||
2620 | extern "C" { | |||
2621 | ||||
2622 | /*! | |||
2623 | @ingroup WORK_SHARING | |||
2624 | @{ | |||
2625 | @param loc Source location | |||
2626 | @param gtid Global thread id | |||
2627 | @param schedule Schedule type | |||
2628 | @param lb Lower bound | |||
2629 | @param ub Upper bound | |||
2630 | @param st Step (or increment if you prefer) | |||
2631 | @param chunk The chunk size to block with | |||
2632 | ||||
2633 | This function prepares the runtime to start a dynamically scheduled for loop, | |||
2634 | saving the loop arguments. | |||
2635 | These functions are all identical apart from the types of the arguments. | |||
2636 | */ | |||
2637 | ||||
2638 | void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, | |||
2639 | enum sched_type schedule, kmp_int32 lb, | |||
2640 | kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) { | |||
2641 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2641); }; | |||
2642 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2643 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2644 | #endif | |||
2645 | __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2646 | } | |||
2647 | /*! | |||
2648 | See @ref __kmpc_dispatch_init_4 | |||
2649 | */ | |||
2650 | void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, | |||
2651 | enum sched_type schedule, kmp_uint32 lb, | |||
2652 | kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) { | |||
2653 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2653); }; | |||
2654 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2655 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2656 | #endif | |||
2657 | __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2658 | } | |||
2659 | ||||
2660 | /*! | |||
2661 | See @ref __kmpc_dispatch_init_4 | |||
2662 | */ | |||
2663 | void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, | |||
2664 | enum sched_type schedule, kmp_int64 lb, | |||
2665 | kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) { | |||
2666 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2666); }; | |||
2667 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2668 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2669 | #endif | |||
2670 | __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2671 | } | |||
2672 | ||||
2673 | /*! | |||
2674 | See @ref __kmpc_dispatch_init_4 | |||
2675 | */ | |||
2676 | void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, | |||
2677 | enum sched_type schedule, kmp_uint64 lb, | |||
2678 | kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) { | |||
2679 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2679); }; | |||
2680 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2681 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2682 | #endif | |||
2683 | __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2684 | } | |||
2685 | ||||
2686 | /*! | |||
2687 | See @ref __kmpc_dispatch_init_4 | |||
2688 | ||||
2689 | Difference from __kmpc_dispatch_init set of functions is these functions | |||
2690 | are called for composite distribute parallel for construct. Thus before | |||
2691 | regular iterations dispatching we need to calc per-team iteration space. | |||
2692 | ||||
2693 | These functions are all identical apart from the types of the arguments. | |||
2694 | */ | |||
2695 | void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, | |||
2696 | enum sched_type schedule, kmp_int32 *p_last, | |||
2697 | kmp_int32 lb, kmp_int32 ub, kmp_int32 st, | |||
2698 | kmp_int32 chunk) { | |||
2699 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2699); }; | |||
2700 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2701 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2702 | #endif | |||
2703 | __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st); | |||
2704 | __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2705 | } | |||
2706 | ||||
2707 | void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, | |||
2708 | enum sched_type schedule, kmp_int32 *p_last, | |||
2709 | kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, | |||
2710 | kmp_int32 chunk) { | |||
2711 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2711); }; | |||
2712 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2713 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2714 | #endif | |||
2715 | __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st); | |||
2716 | __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2717 | } | |||
2718 | ||||
2719 | void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, | |||
2720 | enum sched_type schedule, kmp_int32 *p_last, | |||
2721 | kmp_int64 lb, kmp_int64 ub, kmp_int64 st, | |||
2722 | kmp_int64 chunk) { | |||
2723 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2723); }; | |||
2724 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2725 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2726 | #endif | |||
2727 | __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st); | |||
2728 | __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2729 | } | |||
2730 | ||||
2731 | void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, | |||
2732 | enum sched_type schedule, kmp_int32 *p_last, | |||
2733 | kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, | |||
2734 | kmp_int64 chunk) { | |||
2735 | KMP_DEBUG_ASSERT(__kmp_init_serial)if (!(__kmp_init_serial)) { __kmp_debug_assert("__kmp_init_serial" , "openmp/runtime/src/kmp_dispatch.cpp", 2735); }; | |||
2736 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2737 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2738 | #endif | |||
2739 | __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st); | |||
2740 | __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true); | |||
2741 | } | |||
2742 | ||||
2743 | /*! | |||
2744 | @param loc Source code location | |||
2745 | @param gtid Global thread id | |||
2746 | @param p_last Pointer to a flag set to one if this is the last chunk or zero | |||
2747 | otherwise | |||
2748 | @param p_lb Pointer to the lower bound for the next chunk of work | |||
2749 | @param p_ub Pointer to the upper bound for the next chunk of work | |||
2750 | @param p_st Pointer to the stride for the next chunk of work | |||
2751 | @return one if there is work to be done, zero otherwise | |||
2752 | ||||
2753 | Get the next dynamically allocated chunk of work for this thread. | |||
2754 | If there is no more work, then the lb,ub and stride need not be modified. | |||
2755 | */ | |||
2756 | int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, | |||
2757 | kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) { | |||
2758 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2759 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2760 | #endif | |||
2761 | return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st | |||
2762 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2763 | , | |||
2764 | OMPT_LOAD_RETURN_ADDRESS(gtid)__ompt_load_return_address(gtid) | |||
2765 | #endif | |||
2766 | ); | |||
2767 | } | |||
2768 | ||||
2769 | /*! | |||
2770 | See @ref __kmpc_dispatch_next_4 | |||
2771 | */ | |||
2772 | int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, | |||
2773 | kmp_uint32 *p_lb, kmp_uint32 *p_ub, | |||
2774 | kmp_int32 *p_st) { | |||
2775 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2776 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2777 | #endif | |||
2778 | return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st | |||
2779 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2780 | , | |||
2781 | OMPT_LOAD_RETURN_ADDRESS(gtid)__ompt_load_return_address(gtid) | |||
2782 | #endif | |||
2783 | ); | |||
2784 | } | |||
2785 | ||||
2786 | /*! | |||
2787 | See @ref __kmpc_dispatch_next_4 | |||
2788 | */ | |||
2789 | int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, | |||
2790 | kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) { | |||
2791 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2792 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2793 | #endif | |||
2794 | return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st | |||
2795 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2796 | , | |||
2797 | OMPT_LOAD_RETURN_ADDRESS(gtid)__ompt_load_return_address(gtid) | |||
2798 | #endif | |||
2799 | ); | |||
2800 | } | |||
2801 | ||||
2802 | /*! | |||
2803 | See @ref __kmpc_dispatch_next_4 | |||
2804 | */ | |||
2805 | int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, | |||
2806 | kmp_uint64 *p_lb, kmp_uint64 *p_ub, | |||
2807 | kmp_int64 *p_st) { | |||
2808 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2809 | OMPT_STORE_RETURN_ADDRESS(gtid)OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address (0)};; | |||
2810 | #endif | |||
2811 | return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st | |||
2812 | #if OMPT_SUPPORT1 && OMPT_OPTIONAL1 | |||
2813 | , | |||
2814 | OMPT_LOAD_RETURN_ADDRESS(gtid)__ompt_load_return_address(gtid) | |||
2815 | #endif | |||
2816 | ); | |||
2817 | } | |||
2818 | ||||
2819 | /*! | |||
2820 | @param loc Source code location | |||
2821 | @param gtid Global thread id | |||
2822 | ||||
2823 | Mark the end of a dynamic loop. | |||
2824 | */ | |||
2825 | void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) { | |||
2826 | __kmp_dispatch_finish<kmp_uint32>(gtid, loc); | |||
2827 | } | |||
2828 | ||||
2829 | /*! | |||
2830 | See @ref __kmpc_dispatch_fini_4 | |||
2831 | */ | |||
2832 | void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) { | |||
2833 | __kmp_dispatch_finish<kmp_uint64>(gtid, loc); | |||
2834 | } | |||
2835 | ||||
2836 | /*! | |||
2837 | See @ref __kmpc_dispatch_fini_4 | |||
2838 | */ | |||
2839 | void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) { | |||
2840 | __kmp_dispatch_finish<kmp_uint32>(gtid, loc); | |||
2841 | } | |||
2842 | ||||
2843 | /*! | |||
2844 | See @ref __kmpc_dispatch_fini_4 | |||
2845 | */ | |||
2846 | void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) { | |||
2847 | __kmp_dispatch_finish<kmp_uint64>(gtid, loc); | |||
2848 | } | |||
2849 | /*! @} */ | |||
2850 | ||||
2851 | //----------------------------------------------------------------------------- | |||
2852 | // Non-template routines from kmp_dispatch.cpp used in other sources | |||
2853 | ||||
2854 | kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) { | |||
2855 | return value == checker; | |||
2856 | } | |||
2857 | ||||
2858 | kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) { | |||
2859 | return value != checker; | |||
2860 | } | |||
2861 | ||||
2862 | kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) { | |||
2863 | return value < checker; | |||
2864 | } | |||
2865 | ||||
2866 | kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) { | |||
2867 | return value >= checker; | |||
2868 | } | |||
2869 | ||||
2870 | kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) { | |||
2871 | return value <= checker; | |||
2872 | } | |||
2873 | ||||
2874 | kmp_uint32 | |||
2875 | __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, | |||
2876 | kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), | |||
2877 | void *obj // Higher-level synchronization object, or NULL. | |||
2878 | ) { | |||
2879 | // note: we may not belong to a team at this point | |||
2880 | volatile kmp_uint32 *spin = spinner; | |||
2881 | kmp_uint32 check = checker; | |||
2882 | kmp_uint32 spins; | |||
2883 | kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred; | |||
2884 | kmp_uint32 r; | |||
2885 | kmp_uint64 time; | |||
2886 | ||||
2887 | KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin))int sync_iters = 0; if (__kmp_itt_fsync_prepare_ptr__3_0) { if (obj == __null) { obj = const_cast<kmp_uint32 *>(spin) ; } } __asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(0x4376) : "%ebx"); | |||
2888 | KMP_INIT_YIELD(spins){ (spins) = __kmp_yield_init; }; | |||
2889 | KMP_INIT_BACKOFF(time){ (time) = __kmp_pause_init; }; | |||
2890 | // main wait spin loop | |||
2891 | while (!f(r = TCR_4(*spin)(*spin), check)) { | |||
2892 | KMP_FSYNC_SPIN_PREPARE(obj)do { if (__kmp_itt_fsync_prepare_ptr__3_0 && sync_iters < __kmp_itt_prepare_delay) { ++sync_iters; if (sync_iters >= __kmp_itt_prepare_delay) { (!__kmp_itt_fsync_prepare_ptr__3_0 ) ? (void)0 : __kmp_itt_fsync_prepare_ptr__3_0((void *)((void *)obj)); } } } while (0); | |||
2893 | /* GEH - remove this since it was accidentally introduced when kmp_wait was | |||
2894 | split. It causes problems with infinite recursion because of exit lock */ | |||
2895 | /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) | |||
2896 | __kmp_abort_thread(); */ | |||
2897 | KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time){ if (__kmp_tpause_enabled) { if (((__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))) { __kmp_tpause(0, (time) ); } else { __kmp_tpause(__kmp_tpause_hint, (time)); } (time) = (time << 1 | 1) & ((kmp_uint64)0xFFFF); } else { __kmp_x86_pause(); if ((((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (((__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)))))) { __kmp_yield(); } else if (__kmp_use_yield == 1) { (spins) -= 2; if (!(spins)) { __kmp_yield(); (spins) = __kmp_yield_next; } } } }; | |||
2898 | } | |||
2899 | KMP_FSYNC_SPIN_ACQUIRED(obj)do { __asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(0x4377) : "%ebx"); if (sync_iters >= __kmp_itt_prepare_delay ) { (!__kmp_itt_fsync_acquired_ptr__3_0) ? (void)0 : __kmp_itt_fsync_acquired_ptr__3_0 ((void *)((void *)obj)); } } while (0); | |||
2900 | return r; | |||
2901 | } | |||
2902 | ||||
2903 | void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, | |||
2904 | kmp_uint32 (*pred)(void *, kmp_uint32), | |||
2905 | void *obj // Higher-level synchronization object, or NULL. | |||
2906 | ) { | |||
2907 | // note: we may not belong to a team at this point | |||
2908 | void *spin = spinner; | |||
2909 | kmp_uint32 check = checker; | |||
2910 | kmp_uint32 spins; | |||
2911 | kmp_uint32 (*f)(void *, kmp_uint32) = pred; | |||
2912 | kmp_uint64 time; | |||
2913 | ||||
2914 | KMP_FSYNC_SPIN_INIT(obj, spin)int sync_iters = 0; if (__kmp_itt_fsync_prepare_ptr__3_0) { if (obj == __null) { obj = spin; } } __asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(0x4376) : "%ebx"); | |||
2915 | KMP_INIT_YIELD(spins){ (spins) = __kmp_yield_init; }; | |||
2916 | KMP_INIT_BACKOFF(time){ (time) = __kmp_pause_init; }; | |||
2917 | // main wait spin loop | |||
2918 | while (!f(spin, check)) { | |||
2919 | KMP_FSYNC_SPIN_PREPARE(obj)do { if (__kmp_itt_fsync_prepare_ptr__3_0 && sync_iters < __kmp_itt_prepare_delay) { ++sync_iters; if (sync_iters >= __kmp_itt_prepare_delay) { (!__kmp_itt_fsync_prepare_ptr__3_0 ) ? (void)0 : __kmp_itt_fsync_prepare_ptr__3_0((void *)((void *)obj)); } } } while (0); | |||
2920 | /* if we have waited a bit, or are noversubscribed, yield */ | |||
2921 | /* pause is in the following code */ | |||
2922 | KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time){ if (__kmp_tpause_enabled) { if (((__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))) { __kmp_tpause(0, (time) ); } else { __kmp_tpause(__kmp_tpause_hint, (time)); } (time) = (time << 1 | 1) & ((kmp_uint64)0xFFFF); } else { __kmp_x86_pause(); if ((((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (((__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)))))) { __kmp_yield(); } else if (__kmp_use_yield == 1) { (spins) -= 2; if (!(spins)) { __kmp_yield(); (spins) = __kmp_yield_next; } } } }; | |||
2923 | } | |||
2924 | KMP_FSYNC_SPIN_ACQUIRED(obj)do { __asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(0x4377) : "%ebx"); if (sync_iters >= __kmp_itt_prepare_delay ) { (!__kmp_itt_fsync_acquired_ptr__3_0) ? (void)0 : __kmp_itt_fsync_acquired_ptr__3_0 ((void *)((void *)obj)); } } while (0); | |||
2925 | } | |||
2926 | ||||
2927 | } // extern "C" | |||
2928 | ||||
2929 | #ifdef KMP_GOMP_COMPAT | |||
2930 | ||||
2931 | void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, | |||
2932 | enum sched_type schedule, kmp_int32 lb, | |||
2933 | kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, | |||
2934 | int push_ws) { | |||
2935 | __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, | |||
2936 | push_ws); | |||
2937 | } | |||
2938 | ||||
2939 | void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, | |||
2940 | enum sched_type schedule, kmp_uint32 lb, | |||
2941 | kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, | |||
2942 | int push_ws) { | |||
2943 | __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, | |||
2944 | push_ws); | |||
2945 | } | |||
2946 | ||||
2947 | void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, | |||
2948 | enum sched_type schedule, kmp_int64 lb, | |||
2949 | kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, | |||
2950 | int push_ws) { | |||
2951 | __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, | |||
2952 | push_ws); | |||
2953 | } | |||
2954 | ||||
2955 | void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, | |||
2956 | enum sched_type schedule, kmp_uint64 lb, | |||
2957 | kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, | |||
2958 | int push_ws) { | |||
2959 | __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, | |||
2960 | push_ws); | |||
2961 | } | |||
2962 | ||||
2963 | void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) { | |||
2964 | __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); | |||
2965 | } | |||
2966 | ||||
2967 | void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) { | |||
2968 | __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); | |||
2969 | } | |||
2970 | ||||
2971 | void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) { | |||
2972 | __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); | |||
2973 | } | |||
2974 | ||||
2975 | void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) { | |||
2976 | __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); | |||
2977 | } | |||
2978 | ||||
2979 | #endif /* KMP_GOMP_COMPAT */ | |||
2980 | ||||
2981 | /* ------------------------------------------------------------------------ */ |