1/*
2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#define __KMP_IMP
14#include "omp.h" /* extern "C" declarations of user-visible routines */
15#include "kmp.h"
16#include "kmp_error.h"
17#include "kmp_i18n.h"
18#include "kmp_itt.h"
19#include "kmp_lock.h"
20#include "kmp_stats.h"
21#include "ompt-specific.h"
22
23#define MAX_MESSAGE 512
24
25// flags will be used in future, e.g. to implement openmp_strict library
26// restrictions
27
28/*!
29 * @ingroup STARTUP_SHUTDOWN
30 * @param loc   in   source location information
31 * @param flags in   for future use (currently ignored)
32 *
33 * Initialize the runtime library. This call is optional; if it is not made then
34 * it will be implicitly called by attempts to use other library functions.
35 */
36void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37  // By default __kmpc_begin() is no-op.
38  char *env;
39  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40      __kmp_str_match_true(env)) {
41    __kmp_middle_initialize();
42    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43  } else if (__kmp_ignore_mppbeg() == FALSE) {
44    // By default __kmp_ignore_mppbeg() returns TRUE.
45    __kmp_internal_begin();
46    KC_TRACE(10, ("__kmpc_begin: called\n"));
47  }
48}
49
50/*!
51 * @ingroup STARTUP_SHUTDOWN
52 * @param loc source location information
53 *
 * Shut down the runtime library. This call is also optional, and even if made
 * it will not do anything unless the `KMP_IGNORE_MPPEND` environment variable
 * is set to zero.
57 */
58void __kmpc_end(ident_t *loc) {
59  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62  // returns FALSE and __kmpc_end() will unregister this root (it can cause
63  // library shut down).
64  if (__kmp_ignore_mppend() == FALSE) {
65    KC_TRACE(10, ("__kmpc_end: called\n"));
66    KA_TRACE(30, ("__kmpc_end\n"));
67
68    __kmp_internal_end_thread(-1);
69  }
70#if KMP_OS_WINDOWS && OMPT_SUPPORT
71  // Normal exit process on Windows does not allow worker threads of the final
72  // parallel region to finish reporting their events, so shutting down the
73  // library here fixes the issue at least for the cases where __kmpc_end() is
74  // placed properly.
75  if (ompt_enabled.enabled)
76    __kmp_internal_end_library(__kmp_gtid_get_specific());
77#endif
78}
79
80/*!
81@ingroup THREAD_STATES
82@param loc Source location information.
83@return The global thread index of the active thread.
84
85This function can be called in any context.
86
If the runtime has only been entered at the outermost level from a
88single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89that which would be returned by omp_get_thread_num() in the outermost
90active parallel construct. (Or zero if there is no active parallel
91construct, since the master thread is necessarily thread zero).
92
93If multiple non-OpenMP threads all enter an OpenMP construct then this
94will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
96OpenMP thread ids returned by omp_get_thread_num()).
97*/
98kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99  kmp_int32 gtid = __kmp_entry_gtid();
100
101  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102
103  return gtid;
104}
105
106/*!
107@ingroup THREAD_STATES
108@param loc Source location information.
109@return The number of threads under control of the OpenMP<sup>*</sup> runtime
110
111This function can be called in any context.
112It returns the total number of threads under the control of the OpenMP runtime.
113That is not a number that can be determined by any OpenMP standard calls, since
114the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime keeps
underlying threads alive even when they are not active (the cost of creating
and destroying OS threads is high), this call counts all such threads even if
they are currently idle and only waiting for work.
119*/
120kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121  KC_TRACE(10,
122           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123
124  return TCR_4(__kmp_all_nth);
125}
126
127/*!
128@ingroup THREAD_STATES
129@param loc Source location information.
130@return The thread number of the calling thread in the innermost active parallel
131construct.
132*/
133kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135  return __kmp_tid_from_gtid(__kmp_entry_gtid());
136}
137
138/*!
139@ingroup THREAD_STATES
140@param loc Source location information.
141@return The number of threads in the innermost active parallel construct.
142*/
143kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145
146  return __kmp_entry_thread()->th.th_team->t.t_nproc;
147}
148
149/*!
150 * @ingroup DEPRECATED
151 * @param loc location description
152 *
153 * This function need not be called. It always returns TRUE.
154 */
155kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156#ifndef KMP_DEBUG
157
158  return TRUE;
159
160#else
161
162  const char *semi2;
163  const char *semi3;
164  int line_no;
165
166  if (__kmp_par_range == 0) {
167    return TRUE;
168  }
169  semi2 = loc->psource;
170  if (semi2 == NULL) {
171    return TRUE;
172  }
173  semi2 = strchr(semi2, ';');
174  if (semi2 == NULL) {
175    return TRUE;
176  }
177  semi2 = strchr(semi2 + 1, ';');
178  if (semi2 == NULL) {
179    return TRUE;
180  }
181  if (__kmp_par_range_filename[0]) {
182    const char *name = semi2 - 1;
183    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184      name--;
185    }
186    if ((*name == '/') || (*name == ';')) {
187      name++;
188    }
189    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190      return __kmp_par_range < 0;
191    }
192  }
193  semi3 = strchr(semi2 + 1, ';');
194  if (__kmp_par_range_routine[0]) {
195    if ((semi3 != NULL) && (semi3 > semi2) &&
196        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197      return __kmp_par_range < 0;
198    }
199  }
200  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202      return __kmp_par_range > 0;
203    }
204    return __kmp_par_range < 0;
205  }
206  return TRUE;
207
208#endif /* KMP_DEBUG */
209}
210
211/*!
212@ingroup THREAD_STATES
213@param loc Source location information.
214@return 1 if this thread is executing inside an active parallel region, zero if
215not.
216*/
217kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218  return __kmp_entry_thread()->th.th_root->r.r_active;
219}
220
221/*!
222@ingroup PARALLEL
223@param loc source location information
224@param global_tid global thread number
225@param num_threads number of threads requested for this parallel construct
226
227Set the number of threads to be used by the next fork spawned by this thread.
228This call is only required if the parallel construct has a `num_threads` clause.
229*/
230void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                             kmp_int32 num_threads) {
232  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                global_tid, num_threads));
234
235  __kmp_push_num_threads(loc, global_tid, num_threads);
236}
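
// Illustrative sketch (not taken from any particular compiler) of how a
//   #pragma omp parallel num_threads(4)
// region is typically lowered onto these entry points; `loc` and `outlined`
// are assumed to be a suitably initialized ident_t and the compiler-generated
// microtask:
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_threads(&loc, gtid, 4);
//   __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined, &shared_var);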
237
238void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240
241  /* the num_threads are automatically popped */
242}
243
244void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                           kmp_int32 proc_bind) {
246  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                proc_bind));
248
249  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250}
251
252/*!
253@ingroup PARALLEL
254@param loc  source location information
255@param argc  total number of arguments in the ellipsis
256@param microtask  pointer to callback routine consisting of outlined parallel
257construct
258@param ...  pointers to shared variables that aren't global
259
260Do the actual fork and call the microtask in the relevant number of threads.
261*/
262void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263  int gtid = __kmp_entry_gtid();
264
265#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start the parallel region timer
268  stats_state_e previous_state = KMP_GET_THREAD_STATE();
269  if (previous_state == stats_state_e::SERIAL_REGION) {
270    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271  } else {
272    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273  }
274  int inParallel = __kmpc_in_parallel(loc);
275  if (inParallel) {
276    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277  } else {
278    KMP_COUNT_BLOCK(OMP_PARALLEL);
279  }
280#endif
281
  // maybe saving thr_state would be enough here
283  {
284    va_list ap;
285    va_start(ap, microtask);
286
287#if OMPT_SUPPORT
288    ompt_frame_t *ompt_frame;
289    if (ompt_enabled.enabled) {
290      kmp_info_t *master_th = __kmp_threads[gtid];
291      kmp_team_t *parent_team = master_th->th.th_team;
292      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293      if (lwt)
294        ompt_frame = &(lwt->ompt_task_info.frame);
295      else {
296        int tid = __kmp_tid_from_gtid(gtid);
297        ompt_frame = &(
298            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299      }
300      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301      OMPT_STORE_RETURN_ADDRESS(gtid);
302    }
303#endif
304
305#if INCLUDE_SSC_MARKS
306    SSC_MARK_FORKING();
307#endif
308    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311/* TODO: revert workaround for Intel(R) 64 tracker #96 */
312#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
313                    &ap
314#else
315                    ap
316#endif
317                    );
318#if INCLUDE_SSC_MARKS
319    SSC_MARK_JOINING();
320#endif
321    __kmp_join_call(loc, gtid
322#if OMPT_SUPPORT
323                    ,
324                    fork_context_intel
325#endif
326                    );
327
328    va_end(ap);
329  }
330
331#if KMP_STATS_ENABLED
332  if (previous_state == stats_state_e::SERIAL_REGION) {
333    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
334  } else {
335    KMP_POP_PARTITIONED_TIMER();
336  }
337#endif // KMP_STATS_ENABLED
338}
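
// A minimal sketch (hypothetical names) of the contract between
// __kmpc_fork_call and its microtask: each pointer passed through the
// ellipsis reappears as a trailing argument of the outlined routine, after
// the global and bound thread ids supplied by the runtime:
//
//   void outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *x) {
//     // body of the parallel region, executed by every thread in the team
//   }
//
//   int x = 0;
//   __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined, &x);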
339
340/*!
341@ingroup PARALLEL
342@param loc source location information
343@param global_tid global thread number
344@param num_teams number of teams requested for the teams construct
345@param num_threads number of threads per team requested for the teams construct
346
347Set the number of teams to be used by the teams construct.
348This call is only required if the teams construct has a `num_teams` clause
349or a `thread_limit` clause (or both).
350*/
351void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
352                           kmp_int32 num_teams, kmp_int32 num_threads) {
353  KA_TRACE(20,
354           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
355            global_tid, num_teams, num_threads));
356
357  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
358}
359
360/*!
361@ingroup PARALLEL
362@param loc  source location information
363@param argc  total number of arguments in the ellipsis
364@param microtask  pointer to callback routine consisting of outlined teams
365construct
366@param ...  pointers to shared variables that aren't global
367
368Do the actual fork and call the microtask in the relevant number of threads.
369*/
370void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
371                       ...) {
372  int gtid = __kmp_entry_gtid();
373  kmp_info_t *this_thr = __kmp_threads[gtid];
374  va_list ap;
375  va_start(ap, microtask);
376
377#if KMP_STATS_ENABLED
378  KMP_COUNT_BLOCK(OMP_TEAMS);
379  stats_state_e previous_state = KMP_GET_THREAD_STATE();
380  if (previous_state == stats_state_e::SERIAL_REGION) {
381    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
382  } else {
383    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
384  }
385#endif
386
387  // remember teams entry point and nesting level
388  this_thr->th.th_teams_microtask = microtask;
389  this_thr->th.th_teams_level =
390      this_thr->th.th_team->t.t_level; // AC: can be >0 on host
391
392#if OMPT_SUPPORT
393  kmp_team_t *parent_team = this_thr->th.th_team;
394  int tid = __kmp_tid_from_gtid(gtid);
395  if (ompt_enabled.enabled) {
396    parent_team->t.t_implicit_task_taskdata[tid]
397        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
398  }
399  OMPT_STORE_RETURN_ADDRESS(gtid);
400#endif
401
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
404  if (this_thr->th.th_teams_size.nteams == 0) {
405    __kmp_push_num_teams(loc, gtid, 0, 0);
406  }
407  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
408  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
409  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
410
411  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
412                  VOLATILE_CAST(microtask_t)
413                      __kmp_teams_master, // "wrapped" task
414                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
415#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
416                  &ap
417#else
418                  ap
419#endif
420                  );
421  __kmp_join_call(loc, gtid
422#if OMPT_SUPPORT
423                  ,
424                  fork_context_intel
425#endif
426                  );
427
428  // Pop current CG root off list
429  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
430  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
431  this_thr->th.th_cg_roots = tmp->up;
432  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
433                 " to node %p. cg_nthreads was %d\n",
434                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
435  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
436  int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
438    __kmp_free(tmp);
439  }
440  // Restore current task's thread_limit from CG root
441  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
442  this_thr->th.th_current_task->td_icvs.thread_limit =
443      this_thr->th.th_cg_roots->cg_thread_limit;
444
445  this_thr->th.th_teams_microtask = NULL;
446  this_thr->th.th_teams_level = 0;
447  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
448  va_end(ap);
449#if KMP_STATS_ENABLED
450  if (previous_state == stats_state_e::SERIAL_REGION) {
451    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
452  } else {
453    KMP_POP_PARTITIONED_TIMER();
454  }
455#endif // KMP_STATS_ENABLED
456}
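
// A hedged sketch of how a
//   #pragma omp teams num_teams(2) thread_limit(8)
// construct might be lowered (names are illustrative; `teams_outlined` has
// the same kmpc_micro signature as a parallel microtask):
//
//   __kmpc_push_num_teams(&loc, gtid, /*num_teams=*/2, /*num_threads=*/8);
//   __kmpc_fork_teams(&loc, /*argc=*/1, (kmpc_micro)teams_outlined, &shared);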
457
458// I don't think this function should ever have been exported.
459// The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
460// openmp code ever called it, but it's been exported from the RTL for so
461// long that I'm afraid to remove the definition.
462int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
463
464/*!
465@ingroup PARALLEL
466@param loc  source location information
467@param global_tid  global thread number
468
469Enter a serialized parallel construct. This interface is used to handle a
470conditional parallel region, like this,
471@code
472#pragma omp parallel if (condition)
473@endcode
474when the condition is false.
475*/
476void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
477// The implementation is now in kmp_runtime.cpp so that it can share static
478// functions with kmp_fork_call since the tasks to be done are similar in
479// each case.
480#if OMPT_SUPPORT
481  OMPT_STORE_RETURN_ADDRESS(global_tid);
482#endif
483  __kmp_serialized_parallel(loc, global_tid);
484}
485
486/*!
487@ingroup PARALLEL
488@param loc  source location information
489@param global_tid  global thread number
490
491Leave a serialized parallel construct.
492*/
493void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494  kmp_internal_control_t *top;
495  kmp_info_t *this_thr;
496  kmp_team_t *serial_team;
497
498  KC_TRACE(10,
499           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
500
501  /* skip all this code for autopar serialized loops since it results in
502     unacceptable overhead */
503  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
504    return;
505
506  // Not autopar code
507  if (!TCR_4(__kmp_init_parallel))
508    __kmp_parallel_initialize();
509
510  __kmp_resume_if_soft_paused();
511
512  this_thr = __kmp_threads[global_tid];
513  serial_team = this_thr->th.th_serial_team;
514
515  kmp_task_team_t *task_team = this_thr->th.th_task_team;
516  // we need to wait for the proxy tasks before finishing the thread
517  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
518    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
519
520  KMP_MB();
521  KMP_DEBUG_ASSERT(serial_team);
522  KMP_ASSERT(serial_team->t.t_serialized);
523  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
524  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
525  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
526  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
527
528#if OMPT_SUPPORT
529  if (ompt_enabled.enabled &&
530      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
531    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
532    if (ompt_enabled.ompt_callback_implicit_task) {
533      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
534          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
535          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
536    }
537
    // clear the task id only after unlinking the task
539    ompt_data_t *parent_task_data;
540    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
541
542    if (ompt_enabled.ompt_callback_parallel_end) {
543      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
544          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
545          ompt_parallel_invoker_program | ompt_parallel_team,
546          OMPT_LOAD_RETURN_ADDRESS(global_tid));
547    }
548    __ompt_lw_taskteam_unlink(this_thr);
549    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
550  }
551#endif
552
553  /* If necessary, pop the internal control stack values and replace the team
554   * values */
555  top = serial_team->t.t_control_stack_top;
556  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
557    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
558    serial_team->t.t_control_stack_top = top->next;
559    __kmp_free(top);
560  }
561
562  // if( serial_team -> t.t_serialized > 1 )
563  serial_team->t.t_level--;
564
565  /* pop dispatch buffers stack */
566  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
567  {
568    dispatch_private_info_t *disp_buffer =
569        serial_team->t.t_dispatch->th_disp_buffer;
570    serial_team->t.t_dispatch->th_disp_buffer =
571        serial_team->t.t_dispatch->th_disp_buffer->next;
572    __kmp_free(disp_buffer);
573  }
574  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
575
576  --serial_team->t.t_serialized;
577  if (serial_team->t.t_serialized == 0) {
578
579/* return to the parallel section */
580
581#if KMP_ARCH_X86 || KMP_ARCH_X86_64
582    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
583      __kmp_clear_x87_fpu_status_word();
584      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
585      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
586    }
587#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
588
589    this_thr->th.th_team = serial_team->t.t_parent;
590    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
591
592    /* restore values cached in the thread */
593    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
594    this_thr->th.th_team_master =
595        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
596    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
597
598    /* TODO the below shouldn't need to be adjusted for serialized teams */
599    this_thr->th.th_dispatch =
600        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
601
602    __kmp_pop_current_task_from_thread(this_thr);
603
604    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
605    this_thr->th.th_current_task->td_flags.executing = 1;
606
607    if (__kmp_tasking_mode != tskm_immediate_exec) {
608      // Copy the task team from the new child / old parent team to the thread.
609      this_thr->th.th_task_team =
610          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
611      KA_TRACE(20,
612               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
613                "team %p\n",
614                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
615    }
616  } else {
617    if (__kmp_tasking_mode != tskm_immediate_exec) {
618      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
619                    "depth of serial team %p to %d\n",
620                    global_tid, serial_team, serial_team->t.t_serialized));
621    }
622  }
623
624  if (__kmp_env_consistency_check)
625    __kmp_pop_parallel(global_tid, NULL);
626#if OMPT_SUPPORT
627  if (ompt_enabled.enabled)
628    this_thr->th.ompt_thread_info.state =
629        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
630                                           : ompt_state_work_parallel);
631#endif
632}
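
// Sketch of the conditional-parallel lowering that this pair of entry points
// exists to support (illustrative only, not the output of a specific
// compiler). When the if clause evaluates to false, the outlined body runs on
// the calling thread between the two calls:
//
//   if (cond) {
//     __kmpc_fork_call(&loc, /*argc=*/0, (kmpc_micro)outlined);
//   } else {
//     kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//     kmp_int32 zero = 0;
//     __kmpc_serialized_parallel(&loc, gtid);
//     outlined(&gtid, &zero); // serialized "team" of one thread
//     __kmpc_end_serialized_parallel(&loc, gtid);
//   }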
633
634/*!
635@ingroup SYNCHRONIZATION
636@param loc  source location information.
637
638Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
639depending on the memory ordering convention obeyed by the compiler
640even that may not be necessary).
641*/
642void __kmpc_flush(ident_t *loc) {
643  KC_TRACE(10, ("__kmpc_flush: called\n"));
644
  /* need an explicit __mf() here since we use volatile instead in the library */
646  KMP_MB(); /* Flush all pending memory write invalidates.  */
647
648#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
649#if KMP_MIC
650// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
651// We shouldn't need it, though, since the ABI rules require that
652// * If the compiler generates NGO stores it also generates the fence
653// * If users hand-code NGO stores they should insert the fence
654// therefore no incomplete unordered stores should be visible.
655#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU does not
  // support SSE2.
662  if (!__kmp_cpuinfo.initialized) {
663    __kmp_query_cpuid(&__kmp_cpuinfo);
664  }
665  if (!__kmp_cpuinfo.sse2) {
666    // CPU cannot execute SSE2 instructions.
667  } else {
668#if KMP_COMPILER_ICC
669    _mm_mfence();
670#elif KMP_COMPILER_MSVC
671    MemoryBarrier();
672#else
673    __sync_synchronize();
674#endif // KMP_COMPILER_ICC
675  }
676#endif // KMP_MIC
677#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
678       KMP_ARCH_RISCV64)
679// Nothing to see here move along
680#elif KMP_ARCH_PPC64
681// Nothing needed here (we have a real MB above).
682#if KMP_OS_CNK
683  // The flushing thread needs to yield here; this prevents a
684  // busy-waiting thread from saturating the pipeline. flush is
685  // often used in loops like this:
686  // while (!flag) {
687  //   #pragma omp flush(flag)
688  // }
689  // and adding the yield here is good for at least a 10x speedup
690  // when running >2 threads per core (on the NAS LU benchmark).
691  __kmp_yield();
692#endif
693#else
694#error Unknown or unsupported architecture
695#endif
696
697#if OMPT_SUPPORT && OMPT_OPTIONAL
698  if (ompt_enabled.ompt_callback_flush) {
699    ompt_callbacks.ompt_callback(ompt_callback_flush)(
700        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
701  }
702#endif
703}
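
// For reference, a bare
//   #pragma omp flush
// is expected to lower to nothing more than (sketch):
//
//   __kmpc_flush(&loc);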
704
705/* -------------------------------------------------------------------------- */
706/*!
707@ingroup SYNCHRONIZATION
708@param loc source location information
709@param global_tid thread id.
710
711Execute a barrier.
712*/
713void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
714  KMP_COUNT_BLOCK(OMP_BARRIER);
715  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
716
717  if (!TCR_4(__kmp_init_parallel))
718    __kmp_parallel_initialize();
719
720  __kmp_resume_if_soft_paused();
721
722  if (__kmp_env_consistency_check) {
723    if (loc == 0) {
724      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
725    }
726    __kmp_check_barrier(global_tid, ct_barrier, loc);
727  }
728
729#if OMPT_SUPPORT
730  ompt_frame_t *ompt_frame;
731  if (ompt_enabled.enabled) {
732    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
733    if (ompt_frame->enter_frame.ptr == NULL)
734      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
735    OMPT_STORE_RETURN_ADDRESS(global_tid);
736  }
737#endif
738  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or
  //   at the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set it to 0 when a new team is created
  // 3) no sync is required
745
746  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
747#if OMPT_SUPPORT && OMPT_OPTIONAL
748  if (ompt_enabled.enabled) {
749    ompt_frame->enter_frame = ompt_data_none;
750  }
751#endif
752}
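
// Sketch of the corresponding lowering for an explicit barrier (the same call
// is also made for the implicit barrier at the end of a worksharing
// construct, as noted above):
//
//   #pragma omp barrier
//     -->  __kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));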
753
754/* The BARRIER for a MASTER section is always explicit   */
755/*!
756@ingroup WORK_SHARING
757@param loc  source location information.
@param global_tid  global thread number.
759@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
760*/
761kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
762  int status = 0;
763
764  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
765
766  if (!TCR_4(__kmp_init_parallel))
767    __kmp_parallel_initialize();
768
769  __kmp_resume_if_soft_paused();
770
771  if (KMP_MASTER_GTID(global_tid)) {
772    KMP_COUNT_BLOCK(OMP_MASTER);
773    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
774    status = 1;
775  }
776
777#if OMPT_SUPPORT && OMPT_OPTIONAL
778  if (status) {
779    if (ompt_enabled.ompt_callback_master) {
780      kmp_info_t *this_thr = __kmp_threads[global_tid];
781      kmp_team_t *team = this_thr->th.th_team;
782
783      int tid = __kmp_tid_from_gtid(global_tid);
784      ompt_callbacks.ompt_callback(ompt_callback_master)(
785          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
786          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
787          OMPT_GET_RETURN_ADDRESS(0));
788    }
789  }
790#endif
791
792  if (__kmp_env_consistency_check) {
793#if KMP_USE_DYNAMIC_LOCK
794    if (status)
795      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
796    else
797      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
798#else
799    if (status)
800      __kmp_push_sync(global_tid, ct_master, loc, NULL);
801    else
802      __kmp_check_sync(global_tid, ct_master, loc, NULL);
803#endif
804  }
805
806  return status;
807}
808
809/*!
810@ingroup WORK_SHARING
811@param loc  source location information.
@param global_tid  global thread number.
813
814Mark the end of a <tt>master</tt> region. This should only be called by the
815thread that executes the <tt>master</tt> region.
816*/
817void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
818  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
819
820  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
821  KMP_POP_PARTITIONED_TIMER();
822
823#if OMPT_SUPPORT && OMPT_OPTIONAL
824  kmp_info_t *this_thr = __kmp_threads[global_tid];
825  kmp_team_t *team = this_thr->th.th_team;
826  if (ompt_enabled.ompt_callback_master) {
827    int tid = __kmp_tid_from_gtid(global_tid);
828    ompt_callbacks.ompt_callback(ompt_callback_master)(
829        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
830        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
831        OMPT_GET_RETURN_ADDRESS(0));
832  }
833#endif
834
835  if (__kmp_env_consistency_check) {
836    if (global_tid < 0)
837      KMP_WARNING(ThreadIdentInvalid);
838
839    if (KMP_MASTER_GTID(global_tid))
840      __kmp_pop_sync(global_tid, ct_master, loc);
841  }
842}
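
// Sketch of the expected calling pattern: only the thread that receives a
// non-zero result from __kmpc_master executes the block and calls
// __kmpc_end_master (any barrier for the construct is separate and explicit):
//
//   if (__kmpc_master(&loc, gtid)) {
//     /* master-only code */
//     __kmpc_end_master(&loc, gtid);
//   }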
843
844/*!
845@ingroup WORK_SHARING
846@param loc  source location information.
847@param gtid  global thread number.
848
849Start execution of an <tt>ordered</tt> construct.
850*/
851void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
852  int cid = 0;
853  kmp_info_t *th;
854  KMP_DEBUG_ASSERT(__kmp_init_serial);
855
856  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
857
858  if (!TCR_4(__kmp_init_parallel))
859    __kmp_parallel_initialize();
860
861  __kmp_resume_if_soft_paused();
862
863#if USE_ITT_BUILD
864  __kmp_itt_ordered_prep(gtid);
865// TODO: ordered_wait_id
866#endif /* USE_ITT_BUILD */
867
868  th = __kmp_threads[gtid];
869
870#if OMPT_SUPPORT && OMPT_OPTIONAL
871  kmp_team_t *team;
872  ompt_wait_id_t lck;
873  void *codeptr_ra;
874  if (ompt_enabled.enabled) {
875    OMPT_STORE_RETURN_ADDRESS(gtid);
876    team = __kmp_team_from_gtid(gtid);
877    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
878    /* OMPT state update */
879    th->th.ompt_thread_info.wait_id = lck;
880    th->th.ompt_thread_info.state = ompt_state_wait_ordered;
881
882    /* OMPT event callback */
883    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
884    if (ompt_enabled.ompt_callback_mutex_acquire) {
885      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
886          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
887          codeptr_ra);
888    }
889  }
890#endif
891
892  if (th->th.th_dispatch->th_deo_fcn != 0)
893    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
894  else
895    __kmp_parallel_deo(&gtid, &cid, loc);
896
897#if OMPT_SUPPORT && OMPT_OPTIONAL
898  if (ompt_enabled.enabled) {
899    /* OMPT state update */
900    th->th.ompt_thread_info.state = ompt_state_work_parallel;
901    th->th.ompt_thread_info.wait_id = 0;
902
903    /* OMPT event callback */
904    if (ompt_enabled.ompt_callback_mutex_acquired) {
905      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
906          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
907    }
908  }
909#endif
910
911#if USE_ITT_BUILD
912  __kmp_itt_ordered_start(gtid);
913#endif /* USE_ITT_BUILD */
914}
915
916/*!
917@ingroup WORK_SHARING
918@param loc  source location information.
919@param gtid  global thread number.
920
921End execution of an <tt>ordered</tt> construct.
922*/
923void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
924  int cid = 0;
925  kmp_info_t *th;
926
927  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
928
929#if USE_ITT_BUILD
930  __kmp_itt_ordered_end(gtid);
931// TODO: ordered_wait_id
932#endif /* USE_ITT_BUILD */
933
934  th = __kmp_threads[gtid];
935
936  if (th->th.th_dispatch->th_dxo_fcn != 0)
937    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
938  else
939    __kmp_parallel_dxo(&gtid, &cid, loc);
940
941#if OMPT_SUPPORT && OMPT_OPTIONAL
942  OMPT_STORE_RETURN_ADDRESS(gtid);
943  if (ompt_enabled.ompt_callback_mutex_released) {
944    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
945        ompt_mutex_ordered,
946        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
947            ->t.t_ordered.dt.t_value,
948        OMPT_LOAD_RETURN_ADDRESS(gtid));
949  }
950#endif
951}
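
// Sketch of the typical use inside a chunk of an ordered loop (illustrative):
//
//   for (/* iterations handed out by the loop dispatcher */) {
//     /* unordered part of the loop body */
//     __kmpc_ordered(&loc, gtid);
//     /* code in the ordered region, executed in iteration order */
//     __kmpc_end_ordered(&loc, gtid);
//   }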
952
953#if KMP_USE_DYNAMIC_LOCK
954
955static __forceinline void
956__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
957                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
958  // Pointer to the allocated indirect lock is written to crit, while indexing
959  // is ignored.
960  void *idx;
961  kmp_indirect_lock_t **lck;
962  lck = (kmp_indirect_lock_t **)crit;
963  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
964  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
965  KMP_SET_I_LOCK_LOCATION(ilk, loc);
966  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
967  KA_TRACE(20,
968           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
969#if USE_ITT_BUILD
970  __kmp_itt_critical_creating(ilk->lock, loc);
971#endif
972  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
973  if (status == 0) {
974#if USE_ITT_BUILD
975    __kmp_itt_critical_destroyed(ilk->lock);
976#endif
977    // We don't really need to destroy the unclaimed lock here since it will be
978    // cleaned up at program exit.
979    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
980  }
981  KMP_DEBUG_ASSERT(*lck != NULL);
982}
983
984// Fast-path acquire tas lock
985#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
986  {                                                                            \
987    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
988    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
989    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
990    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
991        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
992      kmp_uint32 spins;                                                        \
993      KMP_FSYNC_PREPARE(l);                                                    \
994      KMP_INIT_YIELD(spins);                                                   \
995      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
996      do {                                                                     \
997        if (TCR_4(__kmp_nth) >                                                 \
998            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
999          KMP_YIELD(TRUE);                                                     \
1000        } else {                                                               \
1001          KMP_YIELD_SPIN(spins);                                               \
1002        }                                                                      \
1003        __kmp_spin_backoff(&backoff);                                          \
1004      } while (                                                                \
1005          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1006          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1007    }                                                                          \
1008    KMP_FSYNC_ACQUIRED(l);                                                     \
1009  }
1010
1011// Fast-path test tas lock
1012#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1013  {                                                                            \
1014    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1015    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1016    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1017    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1018         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1019  }
1020
1021// Fast-path release tas lock
1022#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1023  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1024
1025#if KMP_USE_FUTEX
1026
1027#include <sys/syscall.h>
1028#include <unistd.h>
1029#ifndef FUTEX_WAIT
1030#define FUTEX_WAIT 0
1031#endif
1032#ifndef FUTEX_WAKE
1033#define FUTEX_WAKE 1
1034#endif
1035
1036// Fast-path acquire futex lock
1037#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1038  {                                                                            \
1039    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1040    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1041    KMP_MB();                                                                  \
1042    KMP_FSYNC_PREPARE(ftx);                                                    \
1043    kmp_int32 poll_val;                                                        \
1044    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1045                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1046                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1047      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1048      if (!cond) {                                                             \
1049        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1050                                         poll_val |                            \
1051                                             KMP_LOCK_BUSY(1, futex))) {       \
1052          continue;                                                            \
1053        }                                                                      \
1054        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1055      }                                                                        \
1056      kmp_int32 rc;                                                            \
1057      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1058                        NULL, NULL, 0)) != 0) {                                \
1059        continue;                                                              \
1060      }                                                                        \
1061      gtid_code |= 1;                                                          \
1062    }                                                                          \
1063    KMP_FSYNC_ACQUIRED(ftx);                                                   \
1064  }
1065
1066// Fast-path test futex lock
1067#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1068  {                                                                            \
1069    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1070    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1071                                    KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1072      KMP_FSYNC_ACQUIRED(ftx);                                                 \
1073      rc = TRUE;                                                               \
1074    } else {                                                                   \
1075      rc = FALSE;                                                              \
1076    }                                                                          \
1077  }
1078
1079// Fast-path release futex lock
1080#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1081  {                                                                            \
1082    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1083    KMP_MB();                                                                  \
1084    KMP_FSYNC_RELEASING(ftx);                                                  \
1085    kmp_int32 poll_val =                                                       \
1086        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1087    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1088      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1089              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1090    }                                                                          \
1091    KMP_MB();                                                                  \
1092    KMP_YIELD_OVERSUB();                                                       \
1093  }
1094
1095#endif // KMP_USE_FUTEX
1096
1097#else // KMP_USE_DYNAMIC_LOCK
1098
1099static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1100                                                      ident_t const *loc,
1101                                                      kmp_int32 gtid) {
1102  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1103
1104  // Because of the double-check, the following load doesn't need to be volatile
1105  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1106
1107  if (lck == NULL) {
1108    void *idx;
1109
1110    // Allocate & initialize the lock.
1111    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1112    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1113    __kmp_init_user_lock_with_checks(lck);
1114    __kmp_set_user_lock_location(lck, loc);
1115#if USE_ITT_BUILD
1116    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first use of
// the underlying lock. This is the only place where we can guarantee it. There
// is a chance the lock will be destroyed without ever being used, but that is
// not a problem, because this is not a real event seen by the user; it merely
// sets a name for the object (lock). See kmp_itt.h for more details.
1122#endif /* USE_ITT_BUILD */
1123
1124    // Use a cmpxchg instruction to slam the start of the critical section with
1125    // the lock pointer.  If another thread beat us to it, deallocate the lock,
1126    // and use the lock that the other thread allocated.
1127    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1128
1129    if (status == 0) {
1130// Deallocate the lock and reload the value.
1131#if USE_ITT_BUILD
1132      __kmp_itt_critical_destroyed(lck);
1133// Let ITT know the lock is destroyed and the same memory location may be reused
1134// for another purpose.
1135#endif /* USE_ITT_BUILD */
1136      __kmp_destroy_user_lock_with_checks(lck);
1137      __kmp_user_lock_free(&idx, gtid, lck);
1138      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1139      KMP_DEBUG_ASSERT(lck != NULL);
1140    }
1141  }
1142  return lck;
1143}
1144
1145#endif // KMP_USE_DYNAMIC_LOCK
1146
1147/*!
1148@ingroup WORK_SHARING
1149@param loc  source location information.
@param global_tid  global thread number.
1151@param crit identity of the critical section. This could be a pointer to a lock
1152associated with the critical section, or some other suitably unique value.
1153
1154Enter code protected by a `critical` construct.
1155This function blocks until the executing thread can enter the critical section.
1156*/
1157void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1158                     kmp_critical_name *crit) {
1159#if KMP_USE_DYNAMIC_LOCK
1160#if OMPT_SUPPORT && OMPT_OPTIONAL
1161  OMPT_STORE_RETURN_ADDRESS(global_tid);
1162#endif // OMPT_SUPPORT
1163  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1164#else
1165  KMP_COUNT_BLOCK(OMP_CRITICAL);
1166#if OMPT_SUPPORT && OMPT_OPTIONAL
1167  ompt_state_t prev_state = ompt_state_undefined;
1168  ompt_thread_info_t ti;
1169#endif
1170  kmp_user_lock_p lck;
1171
1172  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1173
1174  // TODO: add THR_OVHD_STATE
1175
1176  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1177  KMP_CHECK_USER_LOCK_INIT();
1178
1179  if ((__kmp_user_lock_kind == lk_tas) &&
1180      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1181    lck = (kmp_user_lock_p)crit;
1182  }
1183#if KMP_USE_FUTEX
1184  else if ((__kmp_user_lock_kind == lk_futex) &&
1185           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1186    lck = (kmp_user_lock_p)crit;
1187  }
1188#endif
1189  else { // ticket, queuing or drdpa
1190    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1191  }
1192
1193  if (__kmp_env_consistency_check)
1194    __kmp_push_sync(global_tid, ct_critical, loc, lck);
1195
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1200
1201#if USE_ITT_BUILD
1202  __kmp_itt_critical_acquiring(lck);
1203#endif /* USE_ITT_BUILD */
1204#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1206  void *codeptr_ra = NULL;
1207  if (ompt_enabled.enabled) {
1208    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1209    /* OMPT state update */
1210    prev_state = ti.state;
1211    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1212    ti.state = ompt_state_wait_critical;
1213
1214    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1216    if (ompt_enabled.ompt_callback_mutex_acquire) {
1217      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1218          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1219          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1220    }
1221  }
1222#endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1225  __kmp_acquire_user_lock_with_checks(lck, global_tid);
1226
1227#if USE_ITT_BUILD
1228  __kmp_itt_critical_acquired(lck);
1229#endif /* USE_ITT_BUILD */
1230#if OMPT_SUPPORT && OMPT_OPTIONAL
1231  if (ompt_enabled.enabled) {
1232    /* OMPT state update */
1233    ti.state = prev_state;
1234    ti.wait_id = 0;
1235
1236    /* OMPT event callback */
1237    if (ompt_enabled.ompt_callback_mutex_acquired) {
1238      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1239          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1240    }
1241  }
1242#endif
1243  KMP_POP_PARTITIONED_TIMER();
1244
1245  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1246  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1247#endif // KMP_USE_DYNAMIC_LOCK
1248}
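
// Sketch of the calling pattern for a critical section; compilers typically
// emit one zero-initialized kmp_critical_name per critical name (the names
// below are illustrative):
//
//   static kmp_critical_name crit_for_name; // all-zero on first use
//   __kmpc_critical(&loc, gtid, &crit_for_name);
//   /* protected code */
//   __kmpc_end_critical(&loc, gtid, &crit_for_name);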
1249
1250#if KMP_USE_DYNAMIC_LOCK
1251
1252// Converts the given hint to an internal lock implementation
1253static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1254#if KMP_USE_TSX
1255#define KMP_TSX_LOCK(seq) lockseq_##seq
1256#else
1257#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1258#endif
1259
1260#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1261#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1262#else
1263#define KMP_CPUINFO_RTM 0
1264#endif
1265
1266  // Hints that do not require further logic
1267  if (hint & kmp_lock_hint_hle)
1268    return KMP_TSX_LOCK(hle);
1269  if (hint & kmp_lock_hint_rtm)
1270    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1271  if (hint & kmp_lock_hint_adaptive)
1272    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1273
1274  // Rule out conflicting hints first by returning the default lock
1275  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1276    return __kmp_user_lock_seq;
1277  if ((hint & omp_lock_hint_speculative) &&
1278      (hint & omp_lock_hint_nonspeculative))
1279    return __kmp_user_lock_seq;
1280
1281  // Do not even consider speculation when it appears to be contended
1282  if (hint & omp_lock_hint_contended)
1283    return lockseq_queuing;
1284
1285  // Uncontended lock without speculation
1286  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1287    return lockseq_tas;
1288
1289  // HLE lock for speculation
1290  if (hint & omp_lock_hint_speculative)
1291    return KMP_TSX_LOCK(hle);
1292
1293  return __kmp_user_lock_seq;
1294}
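
// A few representative mappings implied by the logic above (assuming TSX
// support is both compiled in and detected at runtime; otherwise the TSX
// cases fall back to __kmp_user_lock_seq):
//
//   __kmp_map_hint_to_lock(omp_lock_hint_contended)    -> lockseq_queuing
//   __kmp_map_hint_to_lock(omp_lock_hint_uncontended)  -> lockseq_tas
//   __kmp_map_hint_to_lock(omp_lock_hint_speculative)  -> lockseq_hle
//   __kmp_map_hint_to_lock(kmp_lock_hint_rtm)          -> lockseq_rtm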
1295
1296#if OMPT_SUPPORT && OMPT_OPTIONAL
1297#if KMP_USE_DYNAMIC_LOCK
1298static kmp_mutex_impl_t
1299__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1300  if (user_lock) {
1301    switch (KMP_EXTRACT_D_TAG(user_lock)) {
1302    case 0:
1303      break;
1304#if KMP_USE_FUTEX
1305    case locktag_futex:
1306      return kmp_mutex_impl_queuing;
1307#endif
1308    case locktag_tas:
1309      return kmp_mutex_impl_spin;
1310#if KMP_USE_TSX
1311    case locktag_hle:
1312      return kmp_mutex_impl_speculative;
1313#endif
1314    default:
1315      return kmp_mutex_impl_none;
1316    }
1317    ilock = KMP_LOOKUP_I_LOCK(user_lock);
1318  }
1319  KMP_ASSERT(ilock);
1320  switch (ilock->type) {
1321#if KMP_USE_TSX
1322  case locktag_adaptive:
1323  case locktag_rtm:
1324    return kmp_mutex_impl_speculative;
1325#endif
1326  case locktag_nested_tas:
1327    return kmp_mutex_impl_spin;
1328#if KMP_USE_FUTEX
1329  case locktag_nested_futex:
1330#endif
1331  case locktag_ticket:
1332  case locktag_queuing:
1333  case locktag_drdpa:
1334  case locktag_nested_ticket:
1335  case locktag_nested_queuing:
1336  case locktag_nested_drdpa:
1337    return kmp_mutex_impl_queuing;
1338  default:
1339    return kmp_mutex_impl_none;
1340  }
1341}
1342#else
1343// For locks without dynamic binding
1344static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1345  switch (__kmp_user_lock_kind) {
1346  case lk_tas:
1347    return kmp_mutex_impl_spin;
1348#if KMP_USE_FUTEX
1349  case lk_futex:
1350#endif
1351  case lk_ticket:
1352  case lk_queuing:
1353  case lk_drdpa:
1354    return kmp_mutex_impl_queuing;
1355#if KMP_USE_TSX
1356  case lk_hle:
1357  case lk_rtm:
1358  case lk_adaptive:
1359    return kmp_mutex_impl_speculative;
1360#endif
1361  default:
1362    return kmp_mutex_impl_none;
1363  }
1364}
1365#endif // KMP_USE_DYNAMIC_LOCK
1366#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1367
1368/*!
1369@ingroup WORK_SHARING
1370@param loc  source location information.
1371@param global_tid  global thread number.
1372@param crit identity of the critical section. This could be a pointer to a lock
1373associated with the critical section, or some other suitably unique value.
1374@param hint the lock hint.
1375
1376Enter code protected by a `critical` construct with a hint. The hint value is
1377used to suggest a lock implementation. This function blocks until the executing
1378thread can enter the critical section unless the hint suggests use of
1379speculative execution and the hardware supports it.
1380*/
1381void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1382                               kmp_critical_name *crit, uint32_t hint) {
1383  KMP_COUNT_BLOCK(OMP_CRITICAL);
1384  kmp_user_lock_p lck;
1385#if OMPT_SUPPORT && OMPT_OPTIONAL
1386  ompt_state_t prev_state = ompt_state_undefined;
1387  ompt_thread_info_t ti;
  // This is the case if we were called from __kmpc_critical:
1389  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1390  if (!codeptr)
1391    codeptr = OMPT_GET_RETURN_ADDRESS(0);
1392#endif
1393
1394  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1395
1396  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1397  // Check if it is initialized.
1398  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1399  if (*lk == 0) {
1400    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1401    if (KMP_IS_D_LOCK(lckseq)) {
1402      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1403                                  KMP_GET_D_TAG(lckseq));
1404    } else {
1405      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1406    }
1407  }
1408  // Branch for accessing the actual lock object and set operation. This
1409  // branching is inevitable since this lock initialization does not follow the
1410  // normal dispatch path (lock table is not used).
1411  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1412    lck = (kmp_user_lock_p)lk;
1413    if (__kmp_env_consistency_check) {
1414      __kmp_push_sync(global_tid, ct_critical, loc, lck,
1415                      __kmp_map_hint_to_lock(hint));
1416    }
1417#if USE_ITT_BUILD
1418    __kmp_itt_critical_acquiring(lck);
1419#endif
1420#if OMPT_SUPPORT && OMPT_OPTIONAL
1421    if (ompt_enabled.enabled) {
1422      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1423      /* OMPT state update */
1424      prev_state = ti.state;
1425      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1426      ti.state = ompt_state_wait_critical;
1427
1428      /* OMPT event callback */
1429      if (ompt_enabled.ompt_callback_mutex_acquire) {
1430        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1431            ompt_mutex_critical, (unsigned int)hint,
1432            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1433            codeptr);
1434      }
1435    }
1436#endif
1437#if KMP_USE_INLINED_TAS
1438    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1439      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1440    } else
1441#elif KMP_USE_INLINED_FUTEX
1442    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1443      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1444    } else
1445#endif
1446    {
1447      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1448    }
1449  } else {
1450    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1451    lck = ilk->lock;
1452    if (__kmp_env_consistency_check) {
1453      __kmp_push_sync(global_tid, ct_critical, loc, lck,
1454                      __kmp_map_hint_to_lock(hint));
1455    }
1456#if USE_ITT_BUILD
1457    __kmp_itt_critical_acquiring(lck);
1458#endif
1459#if OMPT_SUPPORT && OMPT_OPTIONAL
1460    if (ompt_enabled.enabled) {
1461      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1462      /* OMPT state update */
1463      prev_state = ti.state;
1464      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1465      ti.state = ompt_state_wait_critical;
1466
1467      /* OMPT event callback */
1468      if (ompt_enabled.ompt_callback_mutex_acquire) {
1469        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1470            ompt_mutex_critical, (unsigned int)hint,
1471            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1472            codeptr);
1473      }
1474    }
1475#endif
1476    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1477  }
1478  KMP_POP_PARTITIONED_TIMER();
1479
1480#if USE_ITT_BUILD
1481  __kmp_itt_critical_acquired(lck);
1482#endif /* USE_ITT_BUILD */
1483#if OMPT_SUPPORT && OMPT_OPTIONAL
1484  if (ompt_enabled.enabled) {
1485    /* OMPT state update */
1486    ti.state = prev_state;
1487    ti.wait_id = 0;
1488
1489    /* OMPT event callback */
1490    if (ompt_enabled.ompt_callback_mutex_acquired) {
1491      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1492          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1493    }
1494  }
1495#endif
1496
1497  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1498  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1499} // __kmpc_critical_with_hint
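
// Sketch of how a `#pragma omp critical (name) hint(...)` construct might
// reach this entry point (names below are illustrative):
//
//   static kmp_critical_name crit_for_name;
//   __kmpc_critical_with_hint(&loc, gtid, &crit_for_name,
//                             omp_lock_hint_contended);
//   /* protected code */
//   __kmpc_end_critical(&loc, gtid, &crit_for_name);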
1500
1501#endif // KMP_USE_DYNAMIC_LOCK
1502
1503/*!
1504@ingroup WORK_SHARING
1505@param loc  source location information.
@param global_tid  global thread number.
1507@param crit identity of the critical section. This could be a pointer to a lock
1508associated with the critical section, or some other suitably unique value.
1509
1510Leave a critical section, releasing any lock that was held during its execution.
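
As an illustrative sketch only (assuming <tt>loc</tt> is a statically allocated
ident_t and <tt>global_tid</tt> is the caller's global thread number), a
compiler typically pairs the two calls as:
@code
static kmp_critical_name crit_name = {0}; // one zero-filled name per critical
__kmpc_critical(&loc, global_tid, &crit_name);
// ... code protected by the critical construct ...
__kmpc_end_critical(&loc, global_tid, &crit_name);
@endcode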
1511*/
1512void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1513                         kmp_critical_name *crit) {
1514  kmp_user_lock_p lck;
1515
1516  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1517
1518#if KMP_USE_DYNAMIC_LOCK
1519  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1520    lck = (kmp_user_lock_p)crit;
1521    KMP_ASSERT(lck != NULL);
1522    if (__kmp_env_consistency_check) {
1523      __kmp_pop_sync(global_tid, ct_critical, loc);
1524    }
1525#if USE_ITT_BUILD
1526    __kmp_itt_critical_releasing(lck);
1527#endif
1528#if KMP_USE_INLINED_TAS
1529    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1530      KMP_RELEASE_TAS_LOCK(lck, global_tid);
1531    } else
1532#elif KMP_USE_INLINED_FUTEX
1533    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1534      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1535    } else
1536#endif
1537    {
1538      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1539    }
1540  } else {
1541    kmp_indirect_lock_t *ilk =
1542        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1543    KMP_ASSERT(ilk != NULL);
1544    lck = ilk->lock;
1545    if (__kmp_env_consistency_check) {
1546      __kmp_pop_sync(global_tid, ct_critical, loc);
1547    }
1548#if USE_ITT_BUILD
1549    __kmp_itt_critical_releasing(lck);
1550#endif
1551    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1552  }
1553
1554#else // KMP_USE_DYNAMIC_LOCK
1555
1556  if ((__kmp_user_lock_kind == lk_tas) &&
1557      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1558    lck = (kmp_user_lock_p)crit;
1559  }
1560#if KMP_USE_FUTEX
1561  else if ((__kmp_user_lock_kind == lk_futex) &&
1562           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1563    lck = (kmp_user_lock_p)crit;
1564  }
1565#endif
1566  else { // ticket, queuing or drdpa
1567    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1568  }
1569
1570  KMP_ASSERT(lck != NULL);
1571
1572  if (__kmp_env_consistency_check)
1573    __kmp_pop_sync(global_tid, ct_critical, loc);
1574
1575#if USE_ITT_BUILD
1576  __kmp_itt_critical_releasing(lck);
1577#endif /* USE_ITT_BUILD */
  // The value of 'crit' should be valid for use as the critical_id of the
  // critical section directive.
1580  __kmp_release_user_lock_with_checks(lck, global_tid);
1581
1582#endif // KMP_USE_DYNAMIC_LOCK
1583
1584#if OMPT_SUPPORT && OMPT_OPTIONAL
1585  /* OMPT release event triggers after lock is released; place here to trigger
1586   * for all #if branches */
1587  OMPT_STORE_RETURN_ADDRESS(global_tid);
1588  if (ompt_enabled.ompt_callback_mutex_released) {
1589    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1590        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1591        OMPT_LOAD_RETURN_ADDRESS(0));
1592  }
1593#endif
1594
1595  KMP_POP_PARTITIONED_TIMER();
1596  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1597}
1598
1599/*!
1600@ingroup SYNCHRONIZATION
1601@param loc source location information
1602@param global_tid thread id.
1603@return one if the thread should execute the master block, zero otherwise
1604
1605Start execution of a combined barrier and master. The barrier is executed inside
1606this function.
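
An illustrative usage sketch (not verbatim compiler output):
@code
if (__kmpc_barrier_master(&loc, global_tid)) {
  // code executed by the master thread only
  __kmpc_end_barrier_master(&loc, global_tid);
}
@endcode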
1607*/
1608kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1609  int status;
1610
1611  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1612
1613  if (!TCR_4(__kmp_init_parallel))
1614    __kmp_parallel_initialize();
1615
1616  __kmp_resume_if_soft_paused();
1617
1618  if (__kmp_env_consistency_check)
1619    __kmp_check_barrier(global_tid, ct_barrier, loc);
1620
1621#if OMPT_SUPPORT
1622  ompt_frame_t *ompt_frame;
1623  if (ompt_enabled.enabled) {
1624    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1625    if (ompt_frame->enter_frame.ptr == NULL)
1626      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1627    OMPT_STORE_RETURN_ADDRESS(global_tid);
1628  }
1629#endif
1630#if USE_ITT_NOTIFY
1631  __kmp_threads[global_tid]->th.th_ident = loc;
1632#endif
1633  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1634#if OMPT_SUPPORT && OMPT_OPTIONAL
1635  if (ompt_enabled.enabled) {
1636    ompt_frame->enter_frame = ompt_data_none;
1637  }
1638#endif
1639
1640  return (status != 0) ? 0 : 1;
1641}
1642
1643/*!
1644@ingroup SYNCHRONIZATION
1645@param loc source location information
1646@param global_tid thread id.
1647
1648Complete the execution of a combined barrier and master. This function should
1649only be called at the completion of the <tt>master</tt> code. Other threads will
1650still be waiting at the barrier and this call releases them.
1651*/
1652void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1653  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1654
1655  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1656}
1657
1658/*!
1659@ingroup SYNCHRONIZATION
1660@param loc source location information
1661@param global_tid thread id.
1662@return one if the thread should execute the master block, zero otherwise
1663
1664Start execution of a combined barrier and master(nowait) construct.
1665The barrier is executed inside this function.
There is no equivalent "end" function, since the bookkeeping that
__kmpc_end_master would otherwise perform is done within this call.
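
An illustrative usage sketch (not verbatim compiler output):
@code
if (__kmpc_barrier_master_nowait(&loc, global_tid)) {
  // code executed by the master thread only; no "end" call is required
}
@endcode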
1667*/
1668kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1669  kmp_int32 ret;
1670
1671  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1672
1673  if (!TCR_4(__kmp_init_parallel))
1674    __kmp_parallel_initialize();
1675
1676  __kmp_resume_if_soft_paused();
1677
1678  if (__kmp_env_consistency_check) {
1679    if (loc == 0) {
1680      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1681    }
1682    __kmp_check_barrier(global_tid, ct_barrier, loc);
1683  }
1684
1685#if OMPT_SUPPORT
1686  ompt_frame_t *ompt_frame;
1687  if (ompt_enabled.enabled) {
1688    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1689    if (ompt_frame->enter_frame.ptr == NULL)
1690      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1691    OMPT_STORE_RETURN_ADDRESS(global_tid);
1692  }
1693#endif
1694#if USE_ITT_NOTIFY
1695  __kmp_threads[global_tid]->th.th_ident = loc;
1696#endif
1697  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1698#if OMPT_SUPPORT && OMPT_OPTIONAL
1699  if (ompt_enabled.enabled) {
1700    ompt_frame->enter_frame = ompt_data_none;
1701  }
1702#endif
1703
1704  ret = __kmpc_master(loc, global_tid);
1705
1706  if (__kmp_env_consistency_check) {
    /* __kmpc_end_master is not called, so the consistency-check actions */
    /* that it would perform are done here instead.                      */
1709
1710    if (global_tid < 0) {
1711      KMP_WARNING(ThreadIdentInvalid);
1712    }
1713    if (ret) {
1714      /* only one thread should do the pop since only */
1715      /* one did the push (see __kmpc_master())       */
1716
1717      __kmp_pop_sync(global_tid, ct_master, loc);
1718    }
1719  }
1720
1721  return (ret);
1722}
1723
1724/* The BARRIER for a SINGLE process section is always explicit   */
1725/*!
1726@ingroup WORK_SHARING
1727@param loc  source location information
1728@param global_tid  global thread number
1729@return One if this thread should execute the single construct, zero otherwise.
1730
1731Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
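
An illustrative lowering sketch (the trailing barrier is emitted only when the
construct has no <tt>nowait</tt> clause):
@code
if (__kmpc_single(&loc, global_tid)) {
  // code executed by the chosen thread only
  __kmpc_end_single(&loc, global_tid);
}
__kmpc_barrier(&loc, global_tid); // explicit barrier, when required
@endcode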
1734*/
1735
1736kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1737  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1738
1739  if (rc) {
1740    // We are going to execute the single statement, so we should count it.
1741    KMP_COUNT_BLOCK(OMP_SINGLE);
1742    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1743  }
1744
1745#if OMPT_SUPPORT && OMPT_OPTIONAL
1746  kmp_info_t *this_thr = __kmp_threads[global_tid];
1747  kmp_team_t *team = this_thr->th.th_team;
1748  int tid = __kmp_tid_from_gtid(global_tid);
1749
1750  if (ompt_enabled.enabled) {
1751    if (rc) {
1752      if (ompt_enabled.ompt_callback_work) {
1753        ompt_callbacks.ompt_callback(ompt_callback_work)(
1754            ompt_work_single_executor, ompt_scope_begin,
1755            &(team->t.ompt_team_info.parallel_data),
1756            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1757            1, OMPT_GET_RETURN_ADDRESS(0));
1758      }
1759    } else {
1760      if (ompt_enabled.ompt_callback_work) {
1761        ompt_callbacks.ompt_callback(ompt_callback_work)(
1762            ompt_work_single_other, ompt_scope_begin,
1763            &(team->t.ompt_team_info.parallel_data),
1764            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1765            1, OMPT_GET_RETURN_ADDRESS(0));
1766        ompt_callbacks.ompt_callback(ompt_callback_work)(
1767            ompt_work_single_other, ompt_scope_end,
1768            &(team->t.ompt_team_info.parallel_data),
1769            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1770            1, OMPT_GET_RETURN_ADDRESS(0));
1771      }
1772    }
1773  }
1774#endif
1775
1776  return rc;
1777}
1778
1779/*!
1780@ingroup WORK_SHARING
1781@param loc  source location information
1782@param global_tid  global thread number
1783
1784Mark the end of a <tt>single</tt> construct.  This function should
1785only be called by the thread that executed the block of code protected
1786by the `single` construct.
1787*/
1788void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1789  __kmp_exit_single(global_tid);
1790  KMP_POP_PARTITIONED_TIMER();
1791
1792#if OMPT_SUPPORT && OMPT_OPTIONAL
1793  kmp_info_t *this_thr = __kmp_threads[global_tid];
1794  kmp_team_t *team = this_thr->th.th_team;
1795  int tid = __kmp_tid_from_gtid(global_tid);
1796
1797  if (ompt_enabled.ompt_callback_work) {
1798    ompt_callbacks.ompt_callback(ompt_callback_work)(
1799        ompt_work_single_executor, ompt_scope_end,
1800        &(team->t.ompt_team_info.parallel_data),
1801        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1802        OMPT_GET_RETURN_ADDRESS(0));
1803  }
1804#endif
1805}
1806
1807/*!
1808@ingroup WORK_SHARING
1809@param loc Source location
1810@param global_tid Global thread id
1811
1812Mark the end of a statically scheduled loop.
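
An illustrative lowering sketch; the matching init call shown here
(__kmpc_for_static_init_4) is defined elsewhere in the runtime, and the trip
count and loop body are hypothetical:
@code
kmp_int32 n = 100; // hypothetical trip count
kmp_int32 last = 0, lower = 0, upper = n - 1, stride = 1;
__kmpc_for_static_init_4(&loc, global_tid, kmp_sch_static, &last, &lower,
                         &upper, &stride, 1, 1);
for (kmp_int32 i = lower; i <= upper; ++i)
  loop_body(i);
__kmpc_for_static_fini(&loc, global_tid);
@endcode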
1813*/
1814void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1815  KMP_POP_PARTITIONED_TIMER();
1816  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1817
1818#if OMPT_SUPPORT && OMPT_OPTIONAL
1819  if (ompt_enabled.ompt_callback_work) {
1820    ompt_work_t ompt_work_type = ompt_work_loop;
1821    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1822    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1823    // Determine workshare type
1824    if (loc != NULL) {
1825      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1826        ompt_work_type = ompt_work_loop;
1827      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1828        ompt_work_type = ompt_work_sections;
1829      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1830        ompt_work_type = ompt_work_distribute;
1831      } else {
1832        // use default set above.
1833        // a warning about this case is provided in __kmpc_for_static_init
1834      }
1835      KMP_DEBUG_ASSERT(ompt_work_type);
1836    }
1837    ompt_callbacks.ompt_callback(ompt_callback_work)(
1838        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1839        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1840  }
1841#endif
1842  if (__kmp_env_consistency_check)
1843    __kmp_pop_workshare(global_tid, ct_pdo, loc);
1844}
1845
// User routines that take C-style arguments (call by value), as opposed to
// the Fortran equivalent routines
1848
1849void ompc_set_num_threads(int arg) {
1850  // !!!!! TODO: check the per-task binding
1851  __kmp_set_num_threads(arg, __kmp_entry_gtid());
1852}
1853
1854void ompc_set_dynamic(int flag) {
1855  kmp_info_t *thread;
1856
1857  /* For the thread-private implementation of the internal controls */
1858  thread = __kmp_entry_thread();
1859
1860  __kmp_save_internal_controls(thread);
1861
1862  set__dynamic(thread, flag ? TRUE : FALSE);
1863}
1864
1865void ompc_set_nested(int flag) {
1866  kmp_info_t *thread;
1867
1868  /* For the thread-private internal controls implementation */
1869  thread = __kmp_entry_thread();
1870
1871  __kmp_save_internal_controls(thread);
1872
1873  set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1874}
1875
1876void ompc_set_max_active_levels(int max_active_levels) {
1877  /* TO DO */
1878  /* we want per-task implementation of this internal control */
1879
1880  /* For the per-thread internal controls implementation */
1881  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1882}
1883
1884void ompc_set_schedule(omp_sched_t kind, int modifier) {
1885  // !!!!! TODO: check the per-task binding
1886  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1887}
1888
1889int ompc_get_ancestor_thread_num(int level) {
1890  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1891}
1892
1893int ompc_get_team_size(int level) {
1894  return __kmp_get_team_size(__kmp_entry_gtid(), level);
1895}
1896
1897/* OpenMP 5.0 Affinity Format API */
1898
1899void ompc_set_affinity_format(char const *format) {
1900  if (!__kmp_init_serial) {
1901    __kmp_serial_initialize();
1902  }
1903  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1904                         format, KMP_STRLEN(format) + 1);
1905}
1906
1907size_t ompc_get_affinity_format(char *buffer, size_t size) {
1908  size_t format_size;
1909  if (!__kmp_init_serial) {
1910    __kmp_serial_initialize();
1911  }
1912  format_size = KMP_STRLEN(__kmp_affinity_format);
1913  if (buffer && size) {
1914    __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1915                           format_size + 1);
1916  }
1917  return format_size;
1918}
1919
1920void ompc_display_affinity(char const *format) {
1921  int gtid;
1922  if (!TCR_4(__kmp_init_middle)) {
1923    __kmp_middle_initialize();
1924  }
1925  gtid = __kmp_get_gtid();
1926  __kmp_aux_display_affinity(gtid, format);
1927}
1928
1929size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1930                             char const *format) {
1931  int gtid;
1932  size_t num_required;
1933  kmp_str_buf_t capture_buf;
1934  if (!TCR_4(__kmp_init_middle)) {
1935    __kmp_middle_initialize();
1936  }
1937  gtid = __kmp_get_gtid();
1938  __kmp_str_buf_init(&capture_buf);
1939  num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1940  if (buffer && buf_size) {
1941    __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1942                           capture_buf.used + 1);
1943  }
1944  __kmp_str_buf_free(&capture_buf);
1945  return num_required;
1946}
1947
1948void kmpc_set_stacksize(int arg) {
1949  // __kmp_aux_set_stacksize initializes the library if needed
1950  __kmp_aux_set_stacksize(arg);
1951}
1952
1953void kmpc_set_stacksize_s(size_t arg) {
1954  // __kmp_aux_set_stacksize initializes the library if needed
1955  __kmp_aux_set_stacksize(arg);
1956}
1957
1958void kmpc_set_blocktime(int arg) {
1959  int gtid, tid;
1960  kmp_info_t *thread;
1961
1962  gtid = __kmp_entry_gtid();
1963  tid = __kmp_tid_from_gtid(gtid);
1964  thread = __kmp_thread_from_gtid(gtid);
1965
1966  __kmp_aux_set_blocktime(arg, thread, tid);
1967}
1968
1969void kmpc_set_library(int arg) {
1970  // __kmp_user_set_library initializes the library if needed
1971  __kmp_user_set_library((enum library_type)arg);
1972}
1973
1974void kmpc_set_defaults(char const *str) {
1975  // __kmp_aux_set_defaults initializes the library if needed
1976  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1977}
1978
1979void kmpc_set_disp_num_buffers(int arg) {
1980  // ignore after initialization because some teams have already
1981  // allocated dispatch buffers
1982  if (__kmp_init_serial == 0 && arg > 0)
1983    __kmp_dispatch_num_buffers = arg;
1984}
1985
1986int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1987#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1988  return -1;
1989#else
1990  if (!TCR_4(__kmp_init_middle)) {
1991    __kmp_middle_initialize();
1992  }
1993  return __kmp_aux_set_affinity_mask_proc(proc, mask);
1994#endif
1995}
1996
1997int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1998#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1999  return -1;
2000#else
2001  if (!TCR_4(__kmp_init_middle)) {
2002    __kmp_middle_initialize();
2003  }
2004  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2005#endif
2006}
2007
2008int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2009#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2010  return -1;
2011#else
2012  if (!TCR_4(__kmp_init_middle)) {
2013    __kmp_middle_initialize();
2014  }
2015  return __kmp_aux_get_affinity_mask_proc(proc, mask);
2016#endif
2017}
2018
2019/* -------------------------------------------------------------------------- */
2020/*!
2021@ingroup THREADPRIVATE
2022@param loc       source location information
2023@param gtid      global thread number
2024@param cpy_size  size of the cpy_data buffer
2025@param cpy_data  pointer to data to be copied
2026@param cpy_func  helper function to call for copying data
2027@param didit     flag variable: 1=single thread; 0=not single thread
2028
2029__kmpc_copyprivate implements the interface for the private data broadcast
2030needed for the copyprivate clause associated with a single region in an
2031OpenMP<sup>*</sup> program (both C and Fortran).
2032All threads participating in the parallel region call this routine.
2033One of the threads (called the single thread) should have the <tt>didit</tt>
2034variable set to 1 and all other threads should have that variable set to 0.
2035All threads pass a pointer to a data buffer (cpy_data) that they have built.
2036
The OpenMP specification forbids the use of nowait on the single region when a
copyprivate clause is present, so the single region would normally be followed
by a barrier. However, since @ref __kmpc_copyprivate already performs a barrier
internally to avoid race conditions, the code generation for the single region
should not emit an additional barrier after the call to @ref __kmpc_copyprivate.
2042
2043The <tt>gtid</tt> parameter is the global thread id for the current thread.
2044The <tt>loc</tt> parameter is a pointer to source location information.
2045
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; after a barrier, each of the other
threads calls the function pointed to by cpy_func, which copies the data from
the single thread's buffer into that thread's own cpy_data buffer.
2050
2051The cpy_func routine used for the copy and the contents of the data area defined
2052by cpy_data and cpy_size may be built in any fashion that will allow the copy
2053to be done. For instance, the cpy_data buffer can hold the actual data to be
2054copied or it may hold a list of pointers to the data. The cpy_func routine must
2055interpret the cpy_data buffer appropriately.
2056
2057The interface to cpy_func is as follows:
2058@code
2059void cpy_func( void *destination, void *source )
2060@endcode
2061where void *destination is the cpy_data pointer for the thread being copied to
2062and void *source is the cpy_data pointer for the thread being copied from.
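
As an illustrative sketch (the helper and variable names here are
hypothetical), broadcasting a single <tt>int</tt> from the single thread might
look like:
@code
static void copy_int(void *dst, void *src) { *(int *)dst = *(int *)src; }

int value;           // each thread's private copy
kmp_int32 didit = 0;
if (__kmpc_single(&loc, gtid)) {
  value = 42;        // produced by the single thread only
  didit = 1;
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(value), &value, copy_int, didit);
// afterwards every thread's 'value' holds the broadcast result
@endcode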
2063*/
2064void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2065                        void *cpy_data, void (*cpy_func)(void *, void *),
2066                        kmp_int32 didit) {
2067  void **data_ptr;
2068
2069  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2070
2071  KMP_MB();
2072
2073  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2074
2075  if (__kmp_env_consistency_check) {
2076    if (loc == 0) {
2077      KMP_WARNING(ConstructIdentInvalid);
2078    }
2079  }
2080
2081  // ToDo: Optimize the following two barriers into some kind of split barrier
2082
2083  if (didit)
2084    *data_ptr = cpy_data;
2085
2086#if OMPT_SUPPORT
2087  ompt_frame_t *ompt_frame;
2088  if (ompt_enabled.enabled) {
2089    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2090    if (ompt_frame->enter_frame.ptr == NULL)
2091      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2092    OMPT_STORE_RETURN_ADDRESS(gtid);
2093  }
2094#endif
2095/* This barrier is not a barrier region boundary */
2096#if USE_ITT_NOTIFY
2097  __kmp_threads[gtid]->th.th_ident = loc;
2098#endif
2099  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2100
2101  if (!didit)
2102    (*cpy_func)(cpy_data, *data_ptr);
2103
2104// Consider next barrier a user-visible barrier for barrier region boundaries
2105// Nesting checks are already handled by the single construct checks
2106
2107#if OMPT_SUPPORT
2108  if (ompt_enabled.enabled) {
2109    OMPT_STORE_RETURN_ADDRESS(gtid);
2110  }
2111#endif
2112#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
                                          // tasks can overwrite the location)
2115#endif
2116  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2117#if OMPT_SUPPORT && OMPT_OPTIONAL
2118  if (ompt_enabled.enabled) {
2119    ompt_frame->enter_frame = ompt_data_none;
2120  }
2121#endif
2122}
2123
2124/* -------------------------------------------------------------------------- */
2125
2126#define INIT_LOCK __kmp_init_user_lock_with_checks
2127#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2128#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2129#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2130#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2131#define ACQUIRE_NESTED_LOCK_TIMED                                              \
2132  __kmp_acquire_nested_user_lock_with_checks_timed
2133#define RELEASE_LOCK __kmp_release_user_lock_with_checks
2134#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2135#define TEST_LOCK __kmp_test_user_lock_with_checks
2136#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2137#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2138#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2139
2140// TODO: Make check abort messages use location info & pass it into
2141// with_checks routines
2142
2143#if KMP_USE_DYNAMIC_LOCK
2144
2145// internal lock initializer
2146static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2147                                                    kmp_dyna_lockseq_t seq) {
2148  if (KMP_IS_D_LOCK(seq)) {
2149    KMP_INIT_D_LOCK(lock, seq);
2150#if USE_ITT_BUILD
2151    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2152#endif
2153  } else {
2154    KMP_INIT_I_LOCK(lock, seq);
2155#if USE_ITT_BUILD
2156    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2157    __kmp_itt_lock_creating(ilk->lock, loc);
2158#endif
2159  }
2160}
2161
2162// internal nest lock initializer
2163static __forceinline void
2164__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2165                               kmp_dyna_lockseq_t seq) {
2166#if KMP_USE_TSX
2167  // Don't have nested lock implementation for speculative locks
2168  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2169    seq = __kmp_user_lock_seq;
2170#endif
2171  switch (seq) {
2172  case lockseq_tas:
2173    seq = lockseq_nested_tas;
2174    break;
2175#if KMP_USE_FUTEX
2176  case lockseq_futex:
2177    seq = lockseq_nested_futex;
2178    break;
2179#endif
2180  case lockseq_ticket:
2181    seq = lockseq_nested_ticket;
2182    break;
2183  case lockseq_queuing:
2184    seq = lockseq_nested_queuing;
2185    break;
2186  case lockseq_drdpa:
2187    seq = lockseq_nested_drdpa;
2188    break;
2189  default:
2190    seq = lockseq_nested_queuing;
2191  }
2192  KMP_INIT_I_LOCK(lock, seq);
2193#if USE_ITT_BUILD
2194  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2195  __kmp_itt_lock_creating(ilk->lock, loc);
2196#endif
2197}
2198
2199/* initialize the lock with a hint */
2200void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2201                                uintptr_t hint) {
2202  KMP_DEBUG_ASSERT(__kmp_init_serial);
2203  if (__kmp_env_consistency_check && user_lock == NULL) {
2204    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2205  }
2206
2207  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2208
2209#if OMPT_SUPPORT && OMPT_OPTIONAL
2210  // This is the case, if called from omp_init_lock_with_hint:
2211  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2212  if (!codeptr)
2213    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2214  if (ompt_enabled.ompt_callback_lock_init) {
2215    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2216        ompt_mutex_lock, (omp_lock_hint_t)hint,
2217        __ompt_get_mutex_impl_type(user_lock),
2218        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2219  }
2220#endif
2221}
2222
/* initialize the nested lock with a hint */
2224void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2225                                     void **user_lock, uintptr_t hint) {
2226  KMP_DEBUG_ASSERT(__kmp_init_serial);
2227  if (__kmp_env_consistency_check && user_lock == NULL) {
2228    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2229  }
2230
2231  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2232
2233#if OMPT_SUPPORT && OMPT_OPTIONAL
2234  // This is the case, if called from omp_init_lock_with_hint:
2235  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2236  if (!codeptr)
2237    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2238  if (ompt_enabled.ompt_callback_lock_init) {
2239    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2240        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2241        __ompt_get_mutex_impl_type(user_lock),
2242        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2243  }
2244#endif
2245}
2246
2247#endif // KMP_USE_DYNAMIC_LOCK
2248
2249/* initialize the lock */
2250void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2251#if KMP_USE_DYNAMIC_LOCK
2252
2253  KMP_DEBUG_ASSERT(__kmp_init_serial);
2254  if (__kmp_env_consistency_check && user_lock == NULL) {
2255    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2256  }
2257  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2258
2259#if OMPT_SUPPORT && OMPT_OPTIONAL
2260  // This is the case, if called from omp_init_lock_with_hint:
2261  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2262  if (!codeptr)
2263    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2264  if (ompt_enabled.ompt_callback_lock_init) {
2265    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2266        ompt_mutex_lock, omp_lock_hint_none,
2267        __ompt_get_mutex_impl_type(user_lock),
2268        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2269  }
2270#endif
2271
2272#else // KMP_USE_DYNAMIC_LOCK
2273
2274  static char const *const func = "omp_init_lock";
2275  kmp_user_lock_p lck;
2276  KMP_DEBUG_ASSERT(__kmp_init_serial);
2277
2278  if (__kmp_env_consistency_check) {
2279    if (user_lock == NULL) {
2280      KMP_FATAL(LockIsUninitialized, func);
2281    }
2282  }
2283
2284  KMP_CHECK_USER_LOCK_INIT();
2285
2286  if ((__kmp_user_lock_kind == lk_tas) &&
2287      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2288    lck = (kmp_user_lock_p)user_lock;
2289  }
2290#if KMP_USE_FUTEX
2291  else if ((__kmp_user_lock_kind == lk_futex) &&
2292           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2293    lck = (kmp_user_lock_p)user_lock;
2294  }
2295#endif
2296  else {
2297    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2298  }
2299  INIT_LOCK(lck);
2300  __kmp_set_user_lock_location(lck, loc);
2301
2302#if OMPT_SUPPORT && OMPT_OPTIONAL
2303  // This is the case, if called from omp_init_lock_with_hint:
2304  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2305  if (!codeptr)
2306    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2307  if (ompt_enabled.ompt_callback_lock_init) {
2308    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2309        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2310        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2311  }
2312#endif
2313
2314#if USE_ITT_BUILD
2315  __kmp_itt_lock_creating(lck);
2316#endif /* USE_ITT_BUILD */
2317
2318#endif // KMP_USE_DYNAMIC_LOCK
2319} // __kmpc_init_lock
2320
/* initialize the nested lock */
2322void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2323#if KMP_USE_DYNAMIC_LOCK
2324
2325  KMP_DEBUG_ASSERT(__kmp_init_serial);
2326  if (__kmp_env_consistency_check && user_lock == NULL) {
2327    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2328  }
2329  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2330
2331#if OMPT_SUPPORT && OMPT_OPTIONAL
2332  // This is the case, if called from omp_init_lock_with_hint:
2333  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2334  if (!codeptr)
2335    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2336  if (ompt_enabled.ompt_callback_lock_init) {
2337    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2338        ompt_mutex_nest_lock, omp_lock_hint_none,
2339        __ompt_get_mutex_impl_type(user_lock),
2340        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2341  }
2342#endif
2343
2344#else // KMP_USE_DYNAMIC_LOCK
2345
2346  static char const *const func = "omp_init_nest_lock";
2347  kmp_user_lock_p lck;
2348  KMP_DEBUG_ASSERT(__kmp_init_serial);
2349
2350  if (__kmp_env_consistency_check) {
2351    if (user_lock == NULL) {
2352      KMP_FATAL(LockIsUninitialized, func);
2353    }
2354  }
2355
2356  KMP_CHECK_USER_LOCK_INIT();
2357
2358  if ((__kmp_user_lock_kind == lk_tas) &&
2359      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2360       OMP_NEST_LOCK_T_SIZE)) {
2361    lck = (kmp_user_lock_p)user_lock;
2362  }
2363#if KMP_USE_FUTEX
2364  else if ((__kmp_user_lock_kind == lk_futex) &&
2365           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2366            OMP_NEST_LOCK_T_SIZE)) {
2367    lck = (kmp_user_lock_p)user_lock;
2368  }
2369#endif
2370  else {
2371    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2372  }
2373
2374  INIT_NESTED_LOCK(lck);
2375  __kmp_set_user_lock_location(lck, loc);
2376
2377#if OMPT_SUPPORT && OMPT_OPTIONAL
2378  // This is the case, if called from omp_init_lock_with_hint:
2379  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2380  if (!codeptr)
2381    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2382  if (ompt_enabled.ompt_callback_lock_init) {
2383    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2384        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2385        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2386  }
2387#endif
2388
2389#if USE_ITT_BUILD
2390  __kmp_itt_lock_creating(lck);
2391#endif /* USE_ITT_BUILD */
2392
2393#endif // KMP_USE_DYNAMIC_LOCK
2394} // __kmpc_init_nest_lock
2395
2396void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2397#if KMP_USE_DYNAMIC_LOCK
2398
2399#if USE_ITT_BUILD
2400  kmp_user_lock_p lck;
2401  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2402    lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2403  } else {
2404    lck = (kmp_user_lock_p)user_lock;
2405  }
2406  __kmp_itt_lock_destroyed(lck);
2407#endif
2408#if OMPT_SUPPORT && OMPT_OPTIONAL
2409  // This is the case, if called from omp_init_lock_with_hint:
2410  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2411  if (!codeptr)
2412    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2413  if (ompt_enabled.ompt_callback_lock_destroy) {
2414    kmp_user_lock_p lck;
2415    if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2416      lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2417    } else {
2418      lck = (kmp_user_lock_p)user_lock;
2419    }
2420    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2421        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2422  }
2423#endif
2424  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2425#else
2426  kmp_user_lock_p lck;
2427
2428  if ((__kmp_user_lock_kind == lk_tas) &&
2429      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2430    lck = (kmp_user_lock_p)user_lock;
2431  }
2432#if KMP_USE_FUTEX
2433  else if ((__kmp_user_lock_kind == lk_futex) &&
2434           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2435    lck = (kmp_user_lock_p)user_lock;
2436  }
2437#endif
2438  else {
2439    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2440  }
2441
2442#if OMPT_SUPPORT && OMPT_OPTIONAL
2443  // This is the case, if called from omp_init_lock_with_hint:
2444  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2445  if (!codeptr)
2446    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2447  if (ompt_enabled.ompt_callback_lock_destroy) {
2448    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2449        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2450  }
2451#endif
2452
2453#if USE_ITT_BUILD
2454  __kmp_itt_lock_destroyed(lck);
2455#endif /* USE_ITT_BUILD */
2456  DESTROY_LOCK(lck);
2457
2458  if ((__kmp_user_lock_kind == lk_tas) &&
2459      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2460    ;
2461  }
2462#if KMP_USE_FUTEX
2463  else if ((__kmp_user_lock_kind == lk_futex) &&
2464           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2465    ;
2466  }
2467#endif
2468  else {
2469    __kmp_user_lock_free(user_lock, gtid, lck);
2470  }
2471#endif // KMP_USE_DYNAMIC_LOCK
2472} // __kmpc_destroy_lock
2473
/* destroy the nested lock */
2475void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2476#if KMP_USE_DYNAMIC_LOCK
2477
2478#if USE_ITT_BUILD
2479  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2480  __kmp_itt_lock_destroyed(ilk->lock);
2481#endif
2482#if OMPT_SUPPORT && OMPT_OPTIONAL
2483  // This is the case, if called from omp_init_lock_with_hint:
2484  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2485  if (!codeptr)
2486    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2487  if (ompt_enabled.ompt_callback_lock_destroy) {
2488    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2489        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2490  }
2491#endif
2492  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2493
2494#else // KMP_USE_DYNAMIC_LOCK
2495
2496  kmp_user_lock_p lck;
2497
2498  if ((__kmp_user_lock_kind == lk_tas) &&
2499      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2500       OMP_NEST_LOCK_T_SIZE)) {
2501    lck = (kmp_user_lock_p)user_lock;
2502  }
2503#if KMP_USE_FUTEX
2504  else if ((__kmp_user_lock_kind == lk_futex) &&
2505           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2506            OMP_NEST_LOCK_T_SIZE)) {
2507    lck = (kmp_user_lock_p)user_lock;
2508  }
2509#endif
2510  else {
2511    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2512  }
2513
2514#if OMPT_SUPPORT && OMPT_OPTIONAL
2515  // This is the case, if called from omp_init_lock_with_hint:
2516  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2517  if (!codeptr)
2518    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2519  if (ompt_enabled.ompt_callback_lock_destroy) {
2520    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2521        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2522  }
2523#endif
2524
2525#if USE_ITT_BUILD
2526  __kmp_itt_lock_destroyed(lck);
2527#endif /* USE_ITT_BUILD */
2528
2529  DESTROY_NESTED_LOCK(lck);
2530
2531  if ((__kmp_user_lock_kind == lk_tas) &&
2532      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2533       OMP_NEST_LOCK_T_SIZE)) {
2534    ;
2535  }
2536#if KMP_USE_FUTEX
2537  else if ((__kmp_user_lock_kind == lk_futex) &&
2538           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2539            OMP_NEST_LOCK_T_SIZE)) {
2540    ;
2541  }
2542#endif
2543  else {
2544    __kmp_user_lock_free(user_lock, gtid, lck);
2545  }
2546#endif // KMP_USE_DYNAMIC_LOCK
2547} // __kmpc_destroy_nest_lock
2548
2549void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2550  KMP_COUNT_BLOCK(OMP_set_lock);
2551#if KMP_USE_DYNAMIC_LOCK
2552  int tag = KMP_EXTRACT_D_TAG(user_lock);
2553#if USE_ITT_BUILD
2554  __kmp_itt_lock_acquiring(
2555      (kmp_user_lock_p)
2556          user_lock); // itt function will get to the right lock object.
2557#endif
2558#if OMPT_SUPPORT && OMPT_OPTIONAL
2559  // This is the case, if called from omp_init_lock_with_hint:
2560  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2561  if (!codeptr)
2562    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2563  if (ompt_enabled.ompt_callback_mutex_acquire) {
2564    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2565        ompt_mutex_lock, omp_lock_hint_none,
2566        __ompt_get_mutex_impl_type(user_lock),
2567        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2568  }
2569#endif
2570#if KMP_USE_INLINED_TAS
2571  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2572    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2573  } else
2574#elif KMP_USE_INLINED_FUTEX
2575  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2576    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2577  } else
2578#endif
2579  {
2580    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2581  }
2582#if USE_ITT_BUILD
2583  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2584#endif
2585#if OMPT_SUPPORT && OMPT_OPTIONAL
2586  if (ompt_enabled.ompt_callback_mutex_acquired) {
2587    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2588        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2589  }
2590#endif
2591
2592#else // KMP_USE_DYNAMIC_LOCK
2593
2594  kmp_user_lock_p lck;
2595
2596  if ((__kmp_user_lock_kind == lk_tas) &&
2597      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2598    lck = (kmp_user_lock_p)user_lock;
2599  }
2600#if KMP_USE_FUTEX
2601  else if ((__kmp_user_lock_kind == lk_futex) &&
2602           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2603    lck = (kmp_user_lock_p)user_lock;
2604  }
2605#endif
2606  else {
2607    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2608  }
2609
2610#if USE_ITT_BUILD
2611  __kmp_itt_lock_acquiring(lck);
2612#endif /* USE_ITT_BUILD */
2613#if OMPT_SUPPORT && OMPT_OPTIONAL
2614  // This is the case, if called from omp_init_lock_with_hint:
2615  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2616  if (!codeptr)
2617    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2618  if (ompt_enabled.ompt_callback_mutex_acquire) {
2619    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2620        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2621        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2622  }
2623#endif
2624
2625  ACQUIRE_LOCK(lck, gtid);
2626
2627#if USE_ITT_BUILD
2628  __kmp_itt_lock_acquired(lck);
2629#endif /* USE_ITT_BUILD */
2630
2631#if OMPT_SUPPORT && OMPT_OPTIONAL
2632  if (ompt_enabled.ompt_callback_mutex_acquired) {
2633    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2634        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2635  }
2636#endif
2637
2638#endif // KMP_USE_DYNAMIC_LOCK
2639}
2640
2641void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2642#if KMP_USE_DYNAMIC_LOCK
2643
2644#if USE_ITT_BUILD
2645  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2646#endif
2647#if OMPT_SUPPORT && OMPT_OPTIONAL
2648  // This is the case, if called from omp_init_lock_with_hint:
2649  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2650  if (!codeptr)
2651    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2652  if (ompt_enabled.enabled) {
2653    if (ompt_enabled.ompt_callback_mutex_acquire) {
2654      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2655          ompt_mutex_nest_lock, omp_lock_hint_none,
2656          __ompt_get_mutex_impl_type(user_lock),
2657          (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2658    }
2659  }
2660#endif
2661  int acquire_status =
2662      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2663  (void) acquire_status;
2664#if USE_ITT_BUILD
2665  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2666#endif
2667
2668#if OMPT_SUPPORT && OMPT_OPTIONAL
2669  if (ompt_enabled.enabled) {
2670    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2671      if (ompt_enabled.ompt_callback_mutex_acquired) {
2672        // lock_first
2673        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2674            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2675            codeptr);
2676      }
2677    } else {
2678      if (ompt_enabled.ompt_callback_nest_lock) {
2679        // lock_next
2680        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2681            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2682      }
2683    }
2684  }
2685#endif
2686
2687#else // KMP_USE_DYNAMIC_LOCK
2688  int acquire_status;
2689  kmp_user_lock_p lck;
2690
2691  if ((__kmp_user_lock_kind == lk_tas) &&
2692      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2693       OMP_NEST_LOCK_T_SIZE)) {
2694    lck = (kmp_user_lock_p)user_lock;
2695  }
2696#if KMP_USE_FUTEX
2697  else if ((__kmp_user_lock_kind == lk_futex) &&
2698           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2699            OMP_NEST_LOCK_T_SIZE)) {
2700    lck = (kmp_user_lock_p)user_lock;
2701  }
2702#endif
2703  else {
2704    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2705  }
2706
2707#if USE_ITT_BUILD
2708  __kmp_itt_lock_acquiring(lck);
2709#endif /* USE_ITT_BUILD */
2710#if OMPT_SUPPORT && OMPT_OPTIONAL
2711  // This is the case, if called from omp_init_lock_with_hint:
2712  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2713  if (!codeptr)
2714    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2715  if (ompt_enabled.enabled) {
2716    if (ompt_enabled.ompt_callback_mutex_acquire) {
2717      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2718          ompt_mutex_nest_lock, omp_lock_hint_none,
2719          __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2720          codeptr);
2721    }
2722  }
2723#endif
2724
2725  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2726
2727#if USE_ITT_BUILD
2728  __kmp_itt_lock_acquired(lck);
2729#endif /* USE_ITT_BUILD */
2730
2731#if OMPT_SUPPORT && OMPT_OPTIONAL
2732  if (ompt_enabled.enabled) {
2733    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2734      if (ompt_enabled.ompt_callback_mutex_acquired) {
2735        // lock_first
2736        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2737            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2738      }
2739    } else {
2740      if (ompt_enabled.ompt_callback_nest_lock) {
2741        // lock_next
2742        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2743            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2744      }
2745    }
2746  }
2747#endif
2748
2749#endif // KMP_USE_DYNAMIC_LOCK
2750}
2751
2752void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2753#if KMP_USE_DYNAMIC_LOCK
2754
2755  int tag = KMP_EXTRACT_D_TAG(user_lock);
2756#if USE_ITT_BUILD
2757  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2758#endif
2759#if KMP_USE_INLINED_TAS
2760  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2761    KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2762  } else
2763#elif KMP_USE_INLINED_FUTEX
2764  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2765    KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2766  } else
2767#endif
2768  {
2769    __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2770  }
2771
2772#if OMPT_SUPPORT && OMPT_OPTIONAL
2773  // This is the case, if called from omp_init_lock_with_hint:
2774  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2775  if (!codeptr)
2776    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2777  if (ompt_enabled.ompt_callback_mutex_released) {
2778    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2779        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2780  }
2781#endif
2782
2783#else // KMP_USE_DYNAMIC_LOCK
2784
2785  kmp_user_lock_p lck;
2786
2787  /* Can't use serial interval since not block structured */
2788  /* release the lock */
2789
2790  if ((__kmp_user_lock_kind == lk_tas) &&
2791      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2792#if KMP_OS_LINUX &&                                                            \
2793    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2794// "fast" path implemented to fix customer performance issue
2795#if USE_ITT_BUILD
2796    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2797#endif /* USE_ITT_BUILD */
2798    TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2799    KMP_MB();
2800
2801#if OMPT_SUPPORT && OMPT_OPTIONAL
2802    // This is the case, if called from omp_init_lock_with_hint:
2803    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2804    if (!codeptr)
2805      codeptr = OMPT_GET_RETURN_ADDRESS(0);
2806    if (ompt_enabled.ompt_callback_mutex_released) {
2807      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2808          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2809    }
2810#endif
2811
2812    return;
2813#else
2814    lck = (kmp_user_lock_p)user_lock;
2815#endif
2816  }
2817#if KMP_USE_FUTEX
2818  else if ((__kmp_user_lock_kind == lk_futex) &&
2819           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2820    lck = (kmp_user_lock_p)user_lock;
2821  }
2822#endif
2823  else {
2824    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2825  }
2826
2827#if USE_ITT_BUILD
2828  __kmp_itt_lock_releasing(lck);
2829#endif /* USE_ITT_BUILD */
2830
2831  RELEASE_LOCK(lck, gtid);
2832
2833#if OMPT_SUPPORT && OMPT_OPTIONAL
2834  // This is the case, if called from omp_init_lock_with_hint:
2835  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2836  if (!codeptr)
2837    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2838  if (ompt_enabled.ompt_callback_mutex_released) {
2839    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2840        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2841  }
2842#endif
2843
2844#endif // KMP_USE_DYNAMIC_LOCK
2845}
2846
/* release the nested lock */
2848void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2849#if KMP_USE_DYNAMIC_LOCK
2850
2851#if USE_ITT_BUILD
2852  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2853#endif
2854  int release_status =
2855      KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2856  (void) release_status;
2857
2858#if OMPT_SUPPORT && OMPT_OPTIONAL
2859  // This is the case, if called from omp_init_lock_with_hint:
2860  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2861  if (!codeptr)
2862    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2863  if (ompt_enabled.enabled) {
2864    if (release_status == KMP_LOCK_RELEASED) {
2865      if (ompt_enabled.ompt_callback_mutex_released) {
2866        // release_lock_last
2867        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2868            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2869            codeptr);
2870      }
2871    } else if (ompt_enabled.ompt_callback_nest_lock) {
2872      // release_lock_prev
2873      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2874          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2875    }
2876  }
2877#endif
2878
2879#else // KMP_USE_DYNAMIC_LOCK
2880
2881  kmp_user_lock_p lck;
2882
2883  /* Can't use serial interval since not block structured */
2884
2885  if ((__kmp_user_lock_kind == lk_tas) &&
2886      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2887       OMP_NEST_LOCK_T_SIZE)) {
2888#if KMP_OS_LINUX &&                                                            \
2889    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2890    // "fast" path implemented to fix customer performance issue
2891    kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2892#if USE_ITT_BUILD
2893    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2894#endif /* USE_ITT_BUILD */
2895
2896#if OMPT_SUPPORT && OMPT_OPTIONAL
2897    int release_status = KMP_LOCK_STILL_HELD;
2898#endif
2899
2900    if (--(tl->lk.depth_locked) == 0) {
2901      TCW_4(tl->lk.poll, 0);
2902#if OMPT_SUPPORT && OMPT_OPTIONAL
2903      release_status = KMP_LOCK_RELEASED;
2904#endif
2905    }
2906    KMP_MB();
2907
2908#if OMPT_SUPPORT && OMPT_OPTIONAL
2909    // This is the case, if called from omp_init_lock_with_hint:
2910    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2911    if (!codeptr)
2912      codeptr = OMPT_GET_RETURN_ADDRESS(0);
2913    if (ompt_enabled.enabled) {
2914      if (release_status == KMP_LOCK_RELEASED) {
2915        if (ompt_enabled.ompt_callback_mutex_released) {
2916          // release_lock_last
2917          ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2918              ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2919        }
2920      } else if (ompt_enabled.ompt_callback_nest_lock) {
2921        // release_lock_previous
2922        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2924      }
2925    }
2926#endif
2927
2928    return;
2929#else
2930    lck = (kmp_user_lock_p)user_lock;
2931#endif
2932  }
2933#if KMP_USE_FUTEX
2934  else if ((__kmp_user_lock_kind == lk_futex) &&
2935           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2936            OMP_NEST_LOCK_T_SIZE)) {
2937    lck = (kmp_user_lock_p)user_lock;
2938  }
2939#endif
2940  else {
2941    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2942  }
2943
2944#if USE_ITT_BUILD
2945  __kmp_itt_lock_releasing(lck);
2946#endif /* USE_ITT_BUILD */
2947
2948  int release_status;
2949  release_status = RELEASE_NESTED_LOCK(lck, gtid);
2950#if OMPT_SUPPORT && OMPT_OPTIONAL
2951  // This is the case, if called from omp_init_lock_with_hint:
2952  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2953  if (!codeptr)
2954    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2955  if (ompt_enabled.enabled) {
2956    if (release_status == KMP_LOCK_RELEASED) {
2957      if (ompt_enabled.ompt_callback_mutex_released) {
2958        // release_lock_last
2959        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2960            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2961      }
2962    } else if (ompt_enabled.ompt_callback_nest_lock) {
2963      // release_lock_previous
2964      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2966    }
2967  }
2968#endif
2969
2970#endif // KMP_USE_DYNAMIC_LOCK
2971}
2972
2973/* try to acquire the lock */
2974int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2975  KMP_COUNT_BLOCK(OMP_test_lock);
2976
2977#if KMP_USE_DYNAMIC_LOCK
2978  int rc;
2979  int tag = KMP_EXTRACT_D_TAG(user_lock);
2980#if USE_ITT_BUILD
2981  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2982#endif
2983#if OMPT_SUPPORT && OMPT_OPTIONAL
2984  // This is the case, if called from omp_init_lock_with_hint:
2985  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2986  if (!codeptr)
2987    codeptr = OMPT_GET_RETURN_ADDRESS(0);
2988  if (ompt_enabled.ompt_callback_mutex_acquire) {
2989    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2990        ompt_mutex_lock, omp_lock_hint_none,
2991        __ompt_get_mutex_impl_type(user_lock),
2992        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2993  }
2994#endif
2995#if KMP_USE_INLINED_TAS
2996  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2997    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2998  } else
2999#elif KMP_USE_INLINED_FUTEX
3000  if (tag == locktag_futex && !__kmp_env_consistency_check) {
3001    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3002  } else
3003#endif
3004  {
3005    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3006  }
3007  if (rc) {
3008#if USE_ITT_BUILD
3009    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3010#endif
3011#if OMPT_SUPPORT && OMPT_OPTIONAL
3012    if (ompt_enabled.ompt_callback_mutex_acquired) {
3013      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3014          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3015    }
3016#endif
3017    return FTN_TRUE;
3018  } else {
3019#if USE_ITT_BUILD
3020    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3021#endif
3022    return FTN_FALSE;
3023  }
3024
3025#else // KMP_USE_DYNAMIC_LOCK
3026
3027  kmp_user_lock_p lck;
3028  int rc;
3029
3030  if ((__kmp_user_lock_kind == lk_tas) &&
3031      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3032    lck = (kmp_user_lock_p)user_lock;
3033  }
3034#if KMP_USE_FUTEX
3035  else if ((__kmp_user_lock_kind == lk_futex) &&
3036           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3037    lck = (kmp_user_lock_p)user_lock;
3038  }
3039#endif
3040  else {
3041    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3042  }
3043
3044#if USE_ITT_BUILD
3045  __kmp_itt_lock_acquiring(lck);
3046#endif /* USE_ITT_BUILD */
3047#if OMPT_SUPPORT && OMPT_OPTIONAL
3048  // This is the case, if called from omp_init_lock_with_hint:
3049  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3050  if (!codeptr)
3051    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3052  if (ompt_enabled.ompt_callback_mutex_acquire) {
3053    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3054        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3055        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3056  }
3057#endif
3058
3059  rc = TEST_LOCK(lck, gtid);
3060#if USE_ITT_BUILD
3061  if (rc) {
3062    __kmp_itt_lock_acquired(lck);
3063  } else {
3064    __kmp_itt_lock_cancelled(lck);
3065  }
3066#endif /* USE_ITT_BUILD */
3067#if OMPT_SUPPORT && OMPT_OPTIONAL
3068  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3069    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3070        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3071  }
3072#endif
3073
3074  return (rc ? FTN_TRUE : FTN_FALSE);
3075
3076/* Can't use serial interval since not block structured */
3077
3078#endif // KMP_USE_DYNAMIC_LOCK
3079}
3080
/* try to acquire the nested lock */
3082int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3083#if KMP_USE_DYNAMIC_LOCK
3084  int rc;
3085#if USE_ITT_BUILD
3086  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3087#endif
3088#if OMPT_SUPPORT && OMPT_OPTIONAL
3089  // This is the case, if called from omp_init_lock_with_hint:
3090  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3091  if (!codeptr)
3092    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3093  if (ompt_enabled.ompt_callback_mutex_acquire) {
3094    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3095        ompt_mutex_nest_lock, omp_lock_hint_none,
3096        __ompt_get_mutex_impl_type(user_lock),
3097        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3098  }
3099#endif
3100  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3101#if USE_ITT_BUILD
3102  if (rc) {
3103    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3104  } else {
3105    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3106  }
3107#endif
3108#if OMPT_SUPPORT && OMPT_OPTIONAL
3109  if (ompt_enabled.enabled && rc) {
3110    if (rc == 1) {
3111      if (ompt_enabled.ompt_callback_mutex_acquired) {
3112        // lock_first
3113        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3114            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3115            codeptr);
3116      }
3117    } else {
3118      if (ompt_enabled.ompt_callback_nest_lock) {
3119        // lock_next
3120        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3121            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3122      }
3123    }
3124  }
3125#endif
3126  return rc;
3127
3128#else // KMP_USE_DYNAMIC_LOCK
3129
3130  kmp_user_lock_p lck;
3131  int rc;
3132
3133  if ((__kmp_user_lock_kind == lk_tas) &&
3134      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3135       OMP_NEST_LOCK_T_SIZE)) {
3136    lck = (kmp_user_lock_p)user_lock;
3137  }
3138#if KMP_USE_FUTEX
3139  else if ((__kmp_user_lock_kind == lk_futex) &&
3140           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3141            OMP_NEST_LOCK_T_SIZE)) {
3142    lck = (kmp_user_lock_p)user_lock;
3143  }
3144#endif
3145  else {
3146    lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3147  }
3148
3149#if USE_ITT_BUILD
3150  __kmp_itt_lock_acquiring(lck);
3151#endif /* USE_ITT_BUILD */
3152
3153#if OMPT_SUPPORT && OMPT_OPTIONAL
3154  // This is the case if called from omp_init_lock_with_hint:
3155  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3156  if (!codeptr)
3157    codeptr = OMPT_GET_RETURN_ADDRESS(0);
3158  if (ompt_enabled.enabled &&
3159      ompt_enabled.ompt_callback_mutex_acquire) {
3160    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3161        ompt_mutex_nest_lock, omp_lock_hint_none,
3162        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3163        codeptr);
3164  }
3165#endif
3166
3167  rc = TEST_NESTED_LOCK(lck, gtid);
3168#if USE_ITT_BUILD
3169  if (rc) {
3170    __kmp_itt_lock_acquired(lck);
3171  } else {
3172    __kmp_itt_lock_cancelled(lck);
3173  }
3174#endif /* USE_ITT_BUILD */
3175#if OMPT_SUPPORT && OMPT_OPTIONAL
3176  if (ompt_enabled.enabled && rc) {
3177    if (rc == 1) {
3178      if (ompt_enabled.ompt_callback_mutex_acquired) {
3179        // lock_first
3180        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3181            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3182      }
3183    } else {
3184      if (ompt_enabled.ompt_callback_nest_lock) {
3185        // lock_next
3186        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3187            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3188      }
3189    }
3190  }
3191#endif
3192  return rc;
3193
3194/* Can't use serial interval since not block structured */
3195
3196#endif // KMP_USE_DYNAMIC_LOCK
3197}
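
/* Illustrative sketch (not part of the runtime): __kmpc_test_nest_lock backs
   omp_test_nest_lock() (see the "omp_test_nest_lock" name used in the lookup
   above). The return value is the nesting count after a successful acquire
   (1 for the first acquisition, >1 when the owner re-acquires) and 0 on
   failure -- the same rc distinction used for the OMPT callbacks above.
   A hypothetical user-level use, assuming omp_init_nest_lock() was called:

     #include <omp.h>

     static omp_nest_lock_t nlock; // hypothetical, initialized elsewhere

     void update(int depth) {
       int count = omp_test_nest_lock(&nlock); // nesting count, or 0
       if (count > 0) {
         if (depth > 0)
           update(depth - 1); // the owner may re-acquire recursively
         omp_unset_nest_lock(&nlock); // one unset per successful test
       }
     }
*/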
3198
3199// Interface to fast scalable reduce method routines
3200
3201// Keep the selected method in a thread-local structure for cross-function
3202// usage: it will be used in the __kmpc_end_reduce* functions.
3203// Another solution: re-determine the method one more time in the
3204// __kmpc_end_reduce* functions (a new prototype would be required then).
3205// AT: which solution is better?
3206#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3207  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3208
3209#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3210  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3211
3212// description of the packed_reduction_method variable: look at the macros in
3213// kmp.h
3214
3215// used in a critical section reduce block
3216static __forceinline void
3217__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3218                                          kmp_critical_name *crit) {
3219
3220  // This lock was visible to a customer and to the threading profile tool as a
3221  // serial overhead span (although it is used for an internal purpose only).
3222  //            Why was it visible in the previous implementation?
3223  //            Should we keep it visible in the new reduce block?
3224  kmp_user_lock_p lck;
3225
3226#if KMP_USE_DYNAMIC_LOCK
3227
3228  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3229  // Check if it is initialized.
3230  if (*lk == 0) {
3231    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3232      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3233                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
3234    } else {
3235      __kmp_init_indirect_csptr(crit, loc, global_tid,
3236                                KMP_GET_I_TAG(__kmp_user_lock_seq));
3237    }
3238  }
3239  // Branch for accessing the actual lock object and set operation. This
3240  // branching is inevitable since this lock initialization does not follow the
3241  // normal dispatch path (lock table is not used).
3242  if (KMP_EXTRACT_D_TAG(lk) != 0) {
3243    lck = (kmp_user_lock_p)lk;
3244    KMP_DEBUG_ASSERT(lck != NULL);
3245    if (__kmp_env_consistency_check) {
3246      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3247    }
3248    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3249  } else {
3250    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3251    lck = ilk->lock;
3252    KMP_DEBUG_ASSERT(lck != NULL);
3253    if (__kmp_env_consistency_check) {
3254      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3255    }
3256    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3257  }
3258
3259#else // KMP_USE_DYNAMIC_LOCK
3260
3261  // We know that the fast reduction code is only emitted by Intel compilers
3262  // with 32 byte critical sections. If there isn't enough space, then we
3263  // have to use a pointer.
3264  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3265    lck = (kmp_user_lock_p)crit;
3266  } else {
3267    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3268  }
3269  KMP_DEBUG_ASSERT(lck != NULL);
3270
3271  if (__kmp_env_consistency_check)
3272    __kmp_push_sync(global_tid, ct_critical, loc, lck);
3273
3274  __kmp_acquire_user_lock_with_checks(lck, global_tid);
3275
3276#endif // KMP_USE_DYNAMIC_LOCK
3277}
3278
3279// used in a critical section reduce block
3280static __forceinline void
3281__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3282                                        kmp_critical_name *crit) {
3283
3284  kmp_user_lock_p lck;
3285
3286#if KMP_USE_DYNAMIC_LOCK
3287
3288  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3289    lck = (kmp_user_lock_p)crit;
3290    if (__kmp_env_consistency_check)
3291      __kmp_pop_sync(global_tid, ct_critical, loc);
3292    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3293  } else {
3294    kmp_indirect_lock_t *ilk =
3295        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3296    if (__kmp_env_consistency_check)
3297      __kmp_pop_sync(global_tid, ct_critical, loc);
3298    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3299  }
3300
3301#else // KMP_USE_DYNAMIC_LOCK
3302
3303  // We know that the fast reduction code is only emitted by Intel compilers
3304  // with 32 byte critical sections. If there isn't enough space, then we have
3305  // to use a pointer.
3306  if (__kmp_base_user_lock_size > 32) {
3307    lck = *((kmp_user_lock_p *)crit);
3308    KMP_ASSERT(lck != NULL);
3309  } else {
3310    lck = (kmp_user_lock_p)crit;
3311  }
3312
3313  if (__kmp_env_consistency_check)
3314    __kmp_pop_sync(global_tid, ct_critical, loc);
3315
3316  __kmp_release_user_lock_with_checks(lck, global_tid);
3317
3318#endif // KMP_USE_DYNAMIC_LOCK
3319} // __kmp_end_critical_section_reduce_block
3320
3321static __forceinline int
3322__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3323                                     int *task_state) {
3324  kmp_team_t *team;
3325
3326  // Check whether we are inside a teams construct.
3327  if (th->th.th_teams_microtask) {
3328    *team_p = team = th->th.th_team;
3329    if (team->t.t_level == th->th.th_teams_level) {
3330      // This is reduction at teams construct.
3331      KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3332      // Let's swap teams temporarily for the reduction.
3333      th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3334      th->th.th_team = team->t.t_parent;
3335      th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3336      th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3337      *task_state = th->th.th_task_state;
3338      th->th.th_task_state = 0;
3339
3340      return 1;
3341    }
3342  }
3343  return 0;
3344}
3345
3346static __forceinline void
3347__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3348  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3349  th->th.th_info.ds.ds_tid = 0;
3350  th->th.th_team = team;
3351  th->th.th_team_nproc = team->t.t_nproc;
3352  th->th.th_task_team = team->t.t_task_team[task_state];
3353  th->th.th_task_state = task_state;
3354}
3355
3356/* 2.a.i. Reduce Block without a terminating barrier */
3357/*!
3358@ingroup SYNCHRONIZATION
3359@param loc source location information
3360@param global_tid global thread number
3361@param num_vars number of items (variables) to be reduced
3362@param reduce_size size of data in bytes to be reduced
3363@param reduce_data pointer to data to be reduced
3364@param reduce_func callback function providing reduction operation on two
3365operands and returning result of reduction in lhs_data
3366@param lck pointer to the unique lock data structure
3367@result 1 for the master thread, 0 for all other team threads, 2 for all team
3368threads if atomic reduction needed
3369
3370The nowait version is used for a reduce clause with the nowait argument.
3371*/
3372kmp_int32
3373__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3374                     size_t reduce_size, void *reduce_data,
3375                     void (*reduce_func)(void *lhs_data, void *rhs_data),
3376                     kmp_critical_name *lck) {
3377
3378  KMP_COUNT_BLOCK(REDUCE_nowait);
3379  int retval = 0;
3380  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3381  kmp_info_t *th;
3382  kmp_team_t *team;
3383  int teams_swapped = 0, task_state;
3384  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3385
3386  // Why do we need this initialization here at all?
3387  // The reduction clause cannot be used as a stand-alone directive.
3388
3389  // Do not call __kmp_serial_initialize(); it will be called by
3390  // __kmp_parallel_initialize() if needed.
3391  // Possible detection of a false-positive race by the thread checker ???
3392  if (!TCR_4(__kmp_init_parallel))
3393    __kmp_parallel_initialize();
3394
3395  __kmp_resume_if_soft_paused();
3396
3397// check correctness of reduce block nesting
3398#if KMP_USE_DYNAMIC_LOCK
3399  if (__kmp_env_consistency_check)
3400    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3401#else
3402  if (__kmp_env_consistency_check)
3403    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3404#endif
3405
3406  th = __kmp_thread_from_gtid(global_tid);
3407  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3408
3409  // The packed_reduction_method value will be reused by the __kmpc_end_reduce*
3410  // functions, so the value should be kept in a variable.
3411  // The variable should be either a construct-specific or a thread-specific
3412  // property, not a team-specific property
3413  //     (a thread can reach the next reduce block on the next construct, and the
3414  //     reduce method may differ on the next construct).
3415  // An ident_t "loc" parameter could be used as a construct-specific property
3416  // (but what if loc == 0?)
3417  //     (if both construct-specific and team-specific variables were shared,
3418  //     then unnecessary extra syncs would be needed).
3419  // A thread-specific variable is better with regard to the two issues above
3420  // (next construct and extra syncs).
3421  // A thread-specific "th_local.reduction_method" variable is used currently.
3422  // Each thread executes the 'determine' and 'set' lines (no need to execute
3423  // them by one thread only, which would require unnecessary extra syncs).
3424
3425  packed_reduction_method = __kmp_determine_reduction_method(
3426      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3427  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3428
3429  OMPT_REDUCTION_DECL(th, global_tid);
3430  if (packed_reduction_method == critical_reduce_block) {
3431
3432    OMPT_REDUCTION_BEGIN;
3433
3434    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3435    retval = 1;
3436
3437  } else if (packed_reduction_method == empty_reduce_block) {
3438
3439    OMPT_REDUCTION_BEGIN;
3440
3441    // usage: if team size == 1, no synchronization is required (Intel
3442    // platforms only)
3443    retval = 1;
3444
3445  } else if (packed_reduction_method == atomic_reduce_block) {
3446
3447    retval = 2;
3448
3449    // All threads should do this pop here (because __kmpc_end_reduce_nowait()
3450    // won't be called by the code gen).
3451    //     (This is not ideal: the checking block has been closed by this
3452    //     'pop',
3453    //      but the atomic operation has not been executed yet; it will be
3454    //      executed slightly later, literally on the next instruction.)
3455    if (__kmp_env_consistency_check)
3456      __kmp_pop_sync(global_tid, ct_reduce, loc);
3457
3458  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3459                                   tree_reduce_block)) {
3460
3461// AT: performance issue: a real barrier here
3462// AT:     (if the master goes slow, other threads are blocked here waiting
3463// for the master to come and release them)
3464// AT:     (it's not what a customer might expect when specifying the NOWAIT
3465// clause; specifying NOWAIT won't improve performance, and it will be
3466// confusing to a customer)
3467// AT: another implementation of *barrier_gather*nowait() (or some other design)
3468// might go faster and be more in line with the sense of NOWAIT
3469// AT: TO DO: run an EPCC benchmark and compare times
3470
3471// this barrier should be invisible to a customer and to the threading profile
3472// tool (it's neither a terminating barrier nor customer's code, it's
3473// used for an internal purpose)
3474#if OMPT_SUPPORT
3475    // JP: can this barrier potentially lead to task scheduling?
3476    // JP: as long as there is a barrier in the implementation, OMPT should
3477    // and will provide the barrier events,
3478    //         so we set up the necessary frame/return addresses.
3479    ompt_frame_t *ompt_frame;
3480    if (ompt_enabled.enabled) {
3481      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3482      if (ompt_frame->enter_frame.ptr == NULL)
3483        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3484      OMPT_STORE_RETURN_ADDRESS(global_tid);
3485    }
3486#endif
3487#if USE_ITT_NOTIFY
3488    __kmp_threads[global_tid]->th.th_ident = loc;
3489#endif
3490    retval =
3491        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3492                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3493    retval = (retval != 0) ? (0) : (1);
3494#if OMPT_SUPPORT && OMPT_OPTIONAL
3495    if (ompt_enabled.enabled) {
3496      ompt_frame->enter_frame = ompt_data_none;
3497    }
3498#endif
3499
3500    // All workers except the master should do this pop here
3501    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3502    if (__kmp_env_consistency_check) {
3503      if (retval == 0) {
3504        __kmp_pop_sync(global_tid, ct_reduce, loc);
3505      }
3506    }
3507
3508  } else {
3509
3510    // should never reach this block
3511    KMP_ASSERT(0); // "unexpected method"
3512  }
3513  if (teams_swapped) {
3514    __kmp_restore_swapped_teams(th, team, task_state);
3515  }
3516  KA_TRACE(
3517      10,
3518      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3519       global_tid, packed_reduction_method, retval));
3520
3521  return retval;
3522}
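
/* Illustrative sketch (not part of the runtime): compiler-generated code for a
   reduction with nowait typically drives this entry point as a switch on the
   return value documented above (1: combine and call __kmpc_end_reduce_nowait;
   2: combine atomically with no end call; 0: nothing left to do). The loc/gtid
   values, the red_crit lock space, the reduce_func callback, and the sum
   variables are hypothetical placeholders.

     // for: #pragma omp for reduction(+ : sum) nowait
     static kmp_critical_name red_crit; // zero-initialized lock space
     double sum_priv = 0.0;             // this thread's partial result
     int ret = __kmpc_reduce_nowait(loc, gtid, 1, sizeof(double), &sum_priv,
                                    &reduce_func, &red_crit);
     switch (ret) {
     case 1: // combine the private copy into the original, then close the block
       sum += sum_priv;
       __kmpc_end_reduce_nowait(loc, gtid, &red_crit);
       break;
     case 2: // combine with an atomic update (real code gen emits an atomic
             // RMW); no end call, per the case-2 comment in
             // __kmpc_end_reduce_nowait() below
       sum += sum_priv;
       break;
     default: // 0: this thread has nothing left to do
       break;
     }
*/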
3523
3524/*!
3525@ingroup SYNCHRONIZATION
3526@param loc source location information
3527@param global_tid global thread id.
3528@param lck pointer to the unique lock data structure
3529
3530Finish the execution of a reduce nowait.
3531*/
3532void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3533                              kmp_critical_name *lck) {
3534
3535  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3536
3537  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3538
3539  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3540
3541  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3542
3543  if (packed_reduction_method == critical_reduce_block) {
3544
3545    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3546    OMPT_REDUCTION_END;
3547
3548  } else if (packed_reduction_method == empty_reduce_block) {
3549
3550    // usage: if team size == 1, no synchronization is required (on Intel
3551    // platforms only)
3552
3553    OMPT_REDUCTION_END;
3554
3555  } else if (packed_reduction_method == atomic_reduce_block) {
3556
3557    // Neither the master nor the other workers should get here
3558    //     (code gen does not generate this call in case 2: atomic reduce block).
3559    // Actually it would be better to remove this 'else if' entirely;
3560    // after removal this value would be checked by the 'else' and would assert.
3561
3562  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3563                                   tree_reduce_block)) {
3564
3565    // only master gets here
3566    // OMPT: tree reduction is annotated in the barrier code
3567
3568  } else {
3569
3570    // should never reach this block
3571    KMP_ASSERT(0); // "unexpected method"
3572  }
3573
3574  if (__kmp_env_consistency_check)
3575    __kmp_pop_sync(global_tid, ct_reduce, loc);
3576
3577  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3578                global_tid, packed_reduction_method));
3579
3580  return;
3581}
3582
3583/* 2.a.ii. Reduce Block with a terminating barrier */
3584
3585/*!
3586@ingroup SYNCHRONIZATION
3587@param loc source location information
3588@param global_tid global thread number
3589@param num_vars number of items (variables) to be reduced
3590@param reduce_size size of data in bytes to be reduced
3591@param reduce_data pointer to data to be reduced
3592@param reduce_func callback function providing reduction operation on two
3593operands and returning result of reduction in lhs_data
3594@param lck pointer to the unique lock data structure
3595@result 1 for the master thread, 0 for all other team threads, 2 for all team
3596threads if atomic reduction needed
3597
3598A blocking reduce that includes an implicit barrier.
3599*/
3600kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3601                        size_t reduce_size, void *reduce_data,
3602                        void (*reduce_func)(void *lhs_data, void *rhs_data),
3603                        kmp_critical_name *lck) {
3604  KMP_COUNT_BLOCK(REDUCE_wait);
3605  int retval = 0;
3606  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3607  kmp_info_t *th;
3608  kmp_team_t *team;
3609  int teams_swapped = 0, task_state;
3610
3611  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3612
3613  // Why do we need this initialization here at all?
3614  // The reduction clause cannot be a stand-alone directive.
3615
3616  // Do not call __kmp_serial_initialize(); it will be called by
3617  // __kmp_parallel_initialize() if needed.
3618  // Possible detection of a false-positive race by the thread checker ???
3619  if (!TCR_4(__kmp_init_parallel))
3620    __kmp_parallel_initialize();
3621
3622  __kmp_resume_if_soft_paused();
3623
3624// check correctness of reduce block nesting
3625#if KMP_USE_DYNAMIC_LOCK
3626  if (__kmp_env_consistency_check)
3627    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3628#else
3629  if (__kmp_env_consistency_check)
3630    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3631#endif
3632
3633  th = __kmp_thread_from_gtid(global_tid);
3634  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3635
3636  packed_reduction_method = __kmp_determine_reduction_method(
3637      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3638  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3639
3640  OMPT_REDUCTION_DECL(th, global_tid);
3641
3642  if (packed_reduction_method == critical_reduce_block) {
3643
3644    OMPT_REDUCTION_BEGIN;
3645    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3646    retval = 1;
3647
3648  } else if (packed_reduction_method == empty_reduce_block) {
3649
3650    OMPT_REDUCTION_BEGIN;
3651    // usage: if team size == 1, no synchronization is required (Intel
3652    // platforms only)
3653    retval = 1;
3654
3655  } else if (packed_reduction_method == atomic_reduce_block) {
3656
3657    retval = 2;
3658
3659  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3660                                   tree_reduce_block)) {
3661
3662// case tree_reduce_block:
3663// this barrier should be visible to a customer and to the threading profile
3664// tool (it's a terminating barrier on constructs if NOWAIT not specified)
3665#if OMPT_SUPPORT
3666    ompt_frame_t *ompt_frame;
3667    if (ompt_enabled.enabled) {
3668      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3669      if (ompt_frame->enter_frame.ptr == NULL)
3670        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3671      OMPT_STORE_RETURN_ADDRESS(global_tid);
3672    }
3673#endif
3674#if USE_ITT_NOTIFY
3675    __kmp_threads[global_tid]->th.th_ident =
3676        loc; // needed for correct notification of frames
3677#endif
3678    retval =
3679        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3680                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3681    retval = (retval != 0) ? (0) : (1);
3682#if OMPT_SUPPORT && OMPT_OPTIONAL
3683    if (ompt_enabled.enabled) {
3684      ompt_frame->enter_frame = ompt_data_none;
3685    }
3686#endif
3687
3688    // All workers except the master should do this pop here
3689    // (none of the other workers will enter __kmpc_end_reduce())
3690    if (__kmp_env_consistency_check) {
3691      if (retval == 0) { // 0: all other workers; 1: master
3692        __kmp_pop_sync(global_tid, ct_reduce, loc);
3693      }
3694    }
3695
3696  } else {
3697
3698    // should never reach this block
3699    KMP_ASSERT(0); // "unexpected method"
3700  }
3701  if (teams_swapped) {
3702    __kmp_restore_swapped_teams(th, team, task_state);
3703  }
3704
3705  KA_TRACE(10,
3706           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3707            global_tid, packed_reduction_method, retval));
3708  return retval;
3709}
3710
3711/*!
3712@ingroup SYNCHRONIZATION
3713@param loc source location information
3714@param global_tid global thread id.
3715@param lck pointer to the unique lock data structure
3716
3717Finish the execution of a blocking reduce.
3718The <tt>lck</tt> pointer must be the same as that used in the corresponding
3719start function.
3720*/
3721void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3722                       kmp_critical_name *lck) {
3723
3724  PACKED_REDUCTION_METHOD_T packed_reduction_method;
3725  kmp_info_t *th;
3726  kmp_team_t *team;
3727  int teams_swapped = 0, task_state;
3728
3729  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3730
3731  th = __kmp_thread_from_gtid(global_tid);
3732  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3733
3734  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3735
3736  // this barrier should be visible to a customer and to the threading profile
3737  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3738  OMPT_REDUCTION_DECL(th, global_tid);
3739
3740  if (packed_reduction_method == critical_reduce_block) {
3741    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3742
3743    OMPT_REDUCTION_END;
3744
3745// TODO: implicit barrier: should be exposed
3746#if OMPT_SUPPORT
3747    ompt_frame_t *ompt_frame;
3748    if (ompt_enabled.enabled) {
3749      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3750      if (ompt_frame->enter_frame.ptr == NULL)
3751        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3752      OMPT_STORE_RETURN_ADDRESS(global_tid);
3753    }
3754#endif
3755#if USE_ITT_NOTIFY
3756    __kmp_threads[global_tid]->th.th_ident = loc;
3757#endif
3758    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3759#if OMPT_SUPPORT && OMPT_OPTIONAL
3760    if (ompt_enabled.enabled) {
3761      ompt_frame->enter_frame = ompt_data_none;
3762    }
3763#endif
3764
3765  } else if (packed_reduction_method == empty_reduce_block) {
3766
3767    OMPT_REDUCTION_END;
3768
3769// usage: if team size==1, no synchronization is required (Intel platforms only)
3770
3771// TODO: implicit barrier: should be exposed
3772#if OMPT_SUPPORT
3773    ompt_frame_t *ompt_frame;
3774    if (ompt_enabled.enabled) {
3775      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3776      if (ompt_frame->enter_frame.ptr == NULL)
3777        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3778      OMPT_STORE_RETURN_ADDRESS(global_tid);
3779    }
3780#endif
3781#if USE_ITT_NOTIFY
3782    __kmp_threads[global_tid]->th.th_ident = loc;
3783#endif
3784    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3785#if OMPT_SUPPORT && OMPT_OPTIONAL
3786    if (ompt_enabled.enabled) {
3787      ompt_frame->enter_frame = ompt_data_none;
3788    }
3789#endif
3790
3791  } else if (packed_reduction_method == atomic_reduce_block) {
3792
3793#if OMPT_SUPPORT
3794    ompt_frame_t *ompt_frame;
3795    if (ompt_enabled.enabled) {
3796      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3797      if (ompt_frame->enter_frame.ptr == NULL)
3798        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3799      OMPT_STORE_RETURN_ADDRESS(global_tid);
3800    }
3801#endif
3802// TODO: implicit barrier: should be exposed
3803#if USE_ITT_NOTIFY
3804    __kmp_threads[global_tid]->th.th_ident = loc;
3805#endif
3806    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3807#if OMPT_SUPPORT && OMPT_OPTIONAL
3808    if (ompt_enabled.enabled) {
3809      ompt_frame->enter_frame = ompt_data_none;
3810    }
3811#endif
3812
3813  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3814                                   tree_reduce_block)) {
3815
3816    // only master executes here (master releases all other workers)
3817    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3818                            global_tid);
3819
3820  } else {
3821
3822    // should never reach this block
3823    KMP_ASSERT(0); // "unexpected method"
3824  }
3825  if (teams_swapped) {
3826    __kmp_restore_swapped_teams(th, team, task_state);
3827  }
3828
3829  if (__kmp_env_consistency_check)
3830    __kmp_pop_sync(global_tid, ct_reduce, loc);
3831
3832  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3833                global_tid, packed_reduction_method));
3834
3835  return;
3836}
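
/* Illustrative sketch (not part of the runtime): the blocking form follows the
   same switch shape as the nowait sketch after __kmpc_reduce_nowait(), except
   that __kmpc_end_reduce() is typically reached on the atomic path as well,
   where (as the atomic_reduce_block branch above suggests) it supplies the
   terminating barrier. Placeholder names as in the earlier sketch.

     // for: #pragma omp for reduction(+ : sum)
     int ret = __kmpc_reduce(loc, gtid, 1, sizeof(double), &sum_priv,
                             &reduce_func, &red_crit);
     switch (ret) {
     case 1: // combine, then close the block (terminating barrier inside)
       sum += sum_priv;
       __kmpc_end_reduce(loc, gtid, &red_crit);
       break;
     case 2: // combine atomically (atomic RMW in real code gen), then close
       sum += sum_priv;
       __kmpc_end_reduce(loc, gtid, &red_crit);
       break;
     default:
       break;
     }
*/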
3837
3838#undef __KMP_GET_REDUCTION_METHOD
3839#undef __KMP_SET_REDUCTION_METHOD
3840
3841/* end of interface to fast scalable reduce routines */
3842
3843kmp_uint64 __kmpc_get_taskid() {
3844
3845  kmp_int32 gtid;
3846  kmp_info_t *thread;
3847
3848  gtid = __kmp_get_gtid();
3849  if (gtid < 0) {
3850    return 0;
3851  }
3852  thread = __kmp_thread_from_gtid(gtid);
3853  return thread->th.th_current_task->td_task_id;
3854
3855} // __kmpc_get_taskid
3856
3857kmp_uint64 __kmpc_get_parent_taskid() {
3858
3859  kmp_int32 gtid;
3860  kmp_info_t *thread;
3861  kmp_taskdata_t *parent_task;
3862
3863  gtid = __kmp_get_gtid();
3864  if (gtid < 0) {
3865    return 0;
3866  }
3867  thread = __kmp_thread_from_gtid(gtid);
3868  parent_task = thread->th.th_current_task->td_parent;
3869  return (parent_task == NULL ? 0 : parent_task->td_task_id);
3870
3871} // __kmpc_get_parent_taskid
3872
3873/*!
3874@ingroup WORK_SHARING
3875@param loc  source location information.
3876@param gtid  global thread number.
3877@param num_dims  number of associated doacross loops.
3878@param dims  info on loops bounds.
3879
3880Initialize doacross loop information.
3881The compiler is expected to send us inclusive bounds,
3882e.g. for (i = 2; i < 9; i += 2) gives lo = 2, up = 8, st = 2.
3883*/
3884void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3885                          const struct kmp_dim *dims) {
3886  int j, idx;
3887  kmp_int64 last, trace_count;
3888  kmp_info_t *th = __kmp_threads[gtid];
3889  kmp_team_t *team = th->th.th_team;
3890  kmp_uint32 *flags;
3891  kmp_disp_t *pr_buf = th->th.th_dispatch;
3892  dispatch_shared_info_t *sh_buf;
3893
3894  KA_TRACE(
3895      20,
3896      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3897       gtid, num_dims, !team->t.t_serialized));
3898  KMP_DEBUG_ASSERT(dims != NULL);
3899  KMP_DEBUG_ASSERT(num_dims > 0);
3900
3901  if (team->t.t_serialized) {
3902    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3903    return; // no dependencies if team is serialized
3904  }
3905  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3906  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3907  // the next loop
3908  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3909
3910  // Save bounds info into allocated private buffer
3911  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3912  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3913      th, sizeof(kmp_int64) * (4 * num_dims + 1));
3914  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3915  pr_buf->th_doacross_info[0] =
3916      (kmp_int64)num_dims; // first element is number of dimensions
3917  // Also save the address of num_done so it can be accessed later without
3918  // knowing the buffer index
3919  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3920  pr_buf->th_doacross_info[2] = dims[0].lo;
3921  pr_buf->th_doacross_info[3] = dims[0].up;
3922  pr_buf->th_doacross_info[4] = dims[0].st;
3923  last = 5;
3924  for (j = 1; j < num_dims; ++j) {
3925    kmp_int64
3926        range_length; // To keep ranges of all dimensions but the first dims[0]
3927    if (dims[j].st == 1) { // most common case
3928      // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
3929      range_length = dims[j].up - dims[j].lo + 1;
3930    } else {
3931      if (dims[j].st > 0) {
3932        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3933        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3934      } else { // negative increment
3935        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3936        range_length =
3937            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3938      }
3939    }
3940    pr_buf->th_doacross_info[last++] = range_length;
3941    pr_buf->th_doacross_info[last++] = dims[j].lo;
3942    pr_buf->th_doacross_info[last++] = dims[j].up;
3943    pr_buf->th_doacross_info[last++] = dims[j].st;
3944  }
3945
3946  // Compute total trip count.
3947  // Start with range of dims[0] which we don't need to keep in the buffer.
3948  if (dims[0].st == 1) { // most common case
3949    trace_count = dims[0].up - dims[0].lo + 1;
3950  } else if (dims[0].st > 0) {
3951    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3952    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3953  } else { // negative increment
3954    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3955    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3956  }
3957  for (j = 1; j < num_dims; ++j) {
3958    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3959  }
3960  KMP_DEBUG_ASSERT(trace_count > 0);
3961
3962  // Check that the shared buffer is not still occupied by another loop (the
3963  // one using buffer index idx - __kmp_dispatch_num_buffers)
3964  if (idx != sh_buf->doacross_buf_idx) {
3965    // Shared buffer is occupied, wait for it to be free
3966    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3967                 __kmp_eq_4, NULL);
3968  }
3969#if KMP_32_BIT_ARCH
3970  // Check if we are the first thread. After the CAS the first thread gets 0,
3971  // others get 1 if initialization is in progress, allocated pointer otherwise.
3972  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3973  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3974      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3975#else
3976  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3977      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3978#endif
3979  if (flags == NULL) {
3980    // we are the first thread, allocate the array of flags
3981    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3982    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3983    KMP_MB();
3984    sh_buf->doacross_flags = flags;
3985  } else if (flags == (kmp_uint32 *)1) {
3986#if KMP_32_BIT_ARCH
3987    // initialization is still in progress, need to wait
3988    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3989#else
3990    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3991#endif
3992      KMP_YIELD(TRUE);
3993    KMP_MB();
3994  } else {
3995    KMP_MB();
3996  }
3997  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3998  pr_buf->th_doacross_flags =
3999      sh_buf->doacross_flags; // save private copy in order to not
4000  // touch shared buffer on each iteration
4001  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4002}
4003
4004void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4005  kmp_int32 shft, num_dims, i;
4006  kmp_uint32 flag;
4007  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4008  kmp_info_t *th = __kmp_threads[gtid];
4009  kmp_team_t *team = th->th.th_team;
4010  kmp_disp_t *pr_buf;
4011  kmp_int64 lo, up, st;
4012
4013  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4014  if (team->t.t_serialized) {
4015    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4016    return; // no dependencies if team is serialized
4017  }
4018
4019  // calculate sequential iteration number and check out-of-bounds condition
4020  pr_buf = th->th.th_dispatch;
4021  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4022  num_dims = pr_buf->th_doacross_info[0];
4023  lo = pr_buf->th_doacross_info[2];
4024  up = pr_buf->th_doacross_info[3];
4025  st = pr_buf->th_doacross_info[4];
4026  if (st == 1) { // most common case
4027    if (vec[0] < lo || vec[0] > up) {
4028      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4029                    "bounds [%lld,%lld]\n",
4030                    gtid, vec[0], lo, up));
4031      return;
4032    }
4033    iter_number = vec[0] - lo;
4034  } else if (st > 0) {
4035    if (vec[0] < lo || vec[0] > up) {
4036      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4037                    "bounds [%lld,%lld]\n",
4038                    gtid, vec[0], lo, up));
4039      return;
4040    }
4041    iter_number = (kmp_uint64)(vec[0] - lo) / st;
4042  } else { // negative increment
4043    if (vec[0] > lo || vec[0] < up) {
4044      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4045                    "bounds [%lld,%lld]\n",
4046                    gtid, vec[0], lo, up));
4047      return;
4048    }
4049    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4050  }
4051  for (i = 1; i < num_dims; ++i) {
4052    kmp_int64 iter, ln;
4053    kmp_int32 j = i * 4;
4054    ln = pr_buf->th_doacross_info[j + 1];
4055    lo = pr_buf->th_doacross_info[j + 2];
4056    up = pr_buf->th_doacross_info[j + 3];
4057    st = pr_buf->th_doacross_info[j + 4];
4058    if (st == 1) {
4059      if (vec[i] < lo || vec[i] > up) {
4060        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4061                      "bounds [%lld,%lld]\n",
4062                      gtid, vec[i], lo, up));
4063        return;
4064      }
4065      iter = vec[i] - lo;
4066    } else if (st > 0) {
4067      if (vec[i] < lo || vec[i] > up) {
4068        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4069                      "bounds [%lld,%lld]\n",
4070                      gtid, vec[i], lo, up));
4071        return;
4072      }
4073      iter = (kmp_uint64)(vec[i] - lo) / st;
4074    } else { // st < 0
4075      if (vec[i] > lo || vec[i] < up) {
4076        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4077                      "bounds [%lld,%lld]\n",
4078                      gtid, vec[i], lo, up));
4079        return;
4080      }
4081      iter = (kmp_uint64)(lo - vec[i]) / (-st);
4082    }
4083    iter_number = iter + ln * iter_number;
4084  }
4085  shft = iter_number % 32; // use 32-bit granularity
4086  iter_number >>= 5; // divided by 32
4087  flag = 1 << shft;
4088  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4089    KMP_YIELD(TRUE);
4090  }
4091  KMP_MB();
4092  KA_TRACE(20,
4093           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4094            gtid, (iter_number << 5) + shft));
4095}
4096
4097void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4098  kmp_int32 shft, num_dims, i;
4099  kmp_uint32 flag;
4100  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4101  kmp_info_t *th = __kmp_threads[gtid];
4102  kmp_team_t *team = th->th.th_team;
4103  kmp_disp_t *pr_buf;
4104  kmp_int64 lo, st;
4105
4106  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4107  if (team->t.t_serialized) {
4108    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4109    return; // no dependencies if team is serialized
4110  }
4111
4112  // calculate sequential iteration number (same as in "wait" but no
4113  // out-of-bounds checks)
4114  pr_buf = th->th.th_dispatch;
4115  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4116  num_dims = pr_buf->th_doacross_info[0];
4117  lo = pr_buf->th_doacross_info[2];
4118  st = pr_buf->th_doacross_info[4];
4119  if (st == 1) { // most common case
4120    iter_number = vec[0] - lo;
4121  } else if (st > 0) {
4122    iter_number = (kmp_uint64)(vec[0] - lo) / st;
4123  } else { // negative increment
4124    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4125  }
4126  for (i = 1; i < num_dims; ++i) {
4127    kmp_int64 iter, ln;
4128    kmp_int32 j = i * 4;
4129    ln = pr_buf->th_doacross_info[j + 1];
4130    lo = pr_buf->th_doacross_info[j + 2];
4131    st = pr_buf->th_doacross_info[j + 4];
4132    if (st == 1) {
4133      iter = vec[i] - lo;
4134    } else if (st > 0) {
4135      iter = (kmp_uint64)(vec[i] - lo) / st;
4136    } else { // st < 0
4137      iter = (kmp_uint64)(lo - vec[i]) / (-st);
4138    }
4139    iter_number = iter + ln * iter_number;
4140  }
4141  shft = iter_number % 32; // use 32-bit granularity
4142  iter_number >>= 5; // divided by 32
4143  flag = 1 << shft;
4144  KMP_MB();
4145  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4146    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4147  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4148                (iter_number << 5) + shft));
4149}
4150
4151void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4152  kmp_int32 num_done;
4153  kmp_info_t *th = __kmp_threads[gtid];
4154  kmp_team_t *team = th->th.th_team;
4155  kmp_disp_t *pr_buf = th->th.th_dispatch;
4156
4157  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4158  if (team->t.t_serialized) {
4159    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4160    return; // nothing to do
4161  }
4162  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4163  if (num_done == th->th.th_team_nproc) {
4164    // we are the last thread, need to free shared resources
4165    int idx = pr_buf->th_doacross_buf_idx - 1;
4166    dispatch_shared_info_t *sh_buf =
4167        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4168    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4169                     (kmp_int64)&sh_buf->doacross_num_done);
4170    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4171    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4172    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4173    sh_buf->doacross_flags = NULL;
4174    sh_buf->doacross_num_done = 0;
4175    sh_buf->doacross_buf_idx +=
4176        __kmp_dispatch_num_buffers; // free buffer for future re-use
4177  }
4178  // free private resources (need to keep buffer index forever)
4179  pr_buf->th_doacross_flags = NULL;
4180  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4181  pr_buf->th_doacross_info = NULL;
4182  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4183}
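
/* Illustrative sketch (not part of the runtime): a plausible lowering of a
   one-dimensional doacross loop onto the init/wait/post/fini entry points
   above, using the inclusive bounds that __kmpc_doacross_init() expects
   (lo = 2, up = 8, st = 2 for the loop in its documentation). The loc/gtid
   values, the lb/ub chunk bounds, and the field order of struct kmp_dim
   (lo, up, st) are assumptions of this sketch.

     // for: #pragma omp for ordered(1)
     //      for (i = 2; i < 9; i += 2) { ... }
     struct kmp_dim dim = {2, 8, 2};
     __kmpc_doacross_init(loc, gtid, 1, &dim);
     for (kmp_int64 i = lb; i <= ub; i += 2) {   // this thread's chunk
       kmp_int64 sink[1] = {i - 2};
       __kmpc_doacross_wait(loc, gtid, sink);    // ordered depend(sink: i - 2)
       // ... loop body ...
       kmp_int64 src[1] = {i};
       __kmpc_doacross_post(loc, gtid, src);     // ordered depend(source)
     }
     __kmpc_doacross_fini(loc, gtid);

   For the first iteration the sink vector falls below lo, and
   __kmpc_doacross_wait() returns immediately thanks to its bounds check.
*/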
4184
4185/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4186void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4187  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4188}
4189
4190void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4191  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4192}
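
/* Illustrative sketch (not part of the runtime): a hypothetical C/C++ caller of
   the OpenMP 5.0 allocator API wrapped above, using the predefined
   omp_default_mem_alloc allocator handle.

     #include <omp.h>

     double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                       omp_default_mem_alloc);
     if (buf) {
       // ... use buf ...
       omp_free(buf, omp_default_mem_alloc);
     }
*/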
4193
4194int __kmpc_get_target_offload(void) {
4195  if (!__kmp_init_serial) {
4196    __kmp_serial_initialize();
4197  }
4198  return __kmp_target_offload;
4199}
4200
4201int __kmpc_pause_resource(kmp_pause_status_t level) {
4202  if (!__kmp_init_serial) {
4203    return 1; // Can't pause if runtime is not initialized
4204  }
4205  return __kmp_pause_resource(level);
4206}
4207