1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34
35/* these are temporary issues to be dealt with */
36#define KMP_USE_PRCTL 0
37
38#if KMP_OS_WINDOWS
39#include <process.h>
40#endif
41
42#include "tsan_annotations.h"
43
44#if defined(KMP_GOMP_COMPAT)
45char const __kmp_version_alt_comp[] =
46    KMP_VERSION_PREFIX "alternative compiler support: yes";
47#endif /* defined(KMP_GOMP_COMPAT) */
48
49char const __kmp_version_omp_api[] =
50    KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51
52#ifdef KMP_DEBUG
53char const __kmp_version_lock[] =
54    KMP_VERSION_PREFIX "lock type: run time selectable";
55#endif /* KMP_DEBUG */
56
57#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58
59/* ------------------------------------------------------------------------ */
60
61#if KMP_USE_MONITOR
62kmp_info_t __kmp_monitor;
63#endif
64
65/* Forward declarations */
66
67void __kmp_cleanup(void);
68
69static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70                                  int gtid);
71static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72                                  kmp_internal_control_t *new_icvs,
73                                  ident_t *loc);
74#if KMP_AFFINITY_SUPPORTED
75static void __kmp_partition_places(kmp_team_t *team,
76                                   int update_master_only = 0);
77#endif
78static void __kmp_do_serial_initialize(void);
79void __kmp_fork_barrier(int gtid, int tid);
80void __kmp_join_barrier(int gtid);
81void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82                          kmp_internal_control_t *new_icvs, ident_t *loc);
83
84#ifdef USE_LOAD_BALANCE
85static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86#endif
87
88static int __kmp_expand_threads(int nNeed);
89#if KMP_OS_WINDOWS
90static int __kmp_unregister_root_other_thread(int gtid);
91#endif
92static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95
96/* Calculate the identifier of the current thread */
97/* fast (and somewhat portable) way to get unique identifier of executing
98   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99int __kmp_get_global_thread_id() {
100  int i;
101  kmp_info_t **other_threads;
102  size_t stack_data;
103  char *stack_addr;
104  size_t stack_size;
105  char *stack_base;
106
107  KA_TRACE(
108      1000,
109      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
110       __kmp_nth, __kmp_all_nth));
111
112  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
113     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
114     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
115     __kmp_init_gtid for this to work. */
116
117  if (!TCR_4(__kmp_init_gtid))
118    return KMP_GTID_DNE;
119
120#ifdef KMP_TDATA_GTID
121  if (TCR_4(__kmp_gtid_mode) >= 3) {
122    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123    return __kmp_gtid;
124  }
125#endif
126  if (TCR_4(__kmp_gtid_mode) >= 2) {
127    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128    return __kmp_gtid_get_specific();
129  }
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131
132  stack_addr = (char *)&stack_data;
133  other_threads = __kmp_threads;
134
135  /* ATT: The code below is a source of potential bugs due to unsynchronized
136     access to __kmp_threads array. For example:
137     1. Current thread loads other_threads[i] to thr and checks it, it is
138        non-NULL.
139     2. Current thread is suspended by OS.
140     3. Another thread unregisters and finishes (debug versions of free()
141        may fill memory with something like 0xEF).
142     4. Current thread is resumed.
143     5. Current thread reads junk from *thr.
144     TODO: Fix it.  --ln  */
145
146  for (i = 0; i < __kmp_threads_capacity; i++) {
147
148    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149    if (!thr)
150      continue;
151
152    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154
155    /* stack grows down -- search through all of the active threads */
156
157    if (stack_addr <= stack_base) {
158      size_t stack_diff = stack_base - stack_addr;
159
160      if (stack_diff <= stack_size) {
161        /* The only way we can be closer than the allocated */
162        /* stack size is if we are running on this thread. */
163        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164        return i;
165      }
166    }
167  }
168
169  /* get specific to try and determine our gtid */
170  KA_TRACE(1000,
171           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172            "thread, using TLS\n"));
173  i = __kmp_gtid_get_specific();
174
175  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
176
177  /* if we havn't been assigned a gtid, then return code */
178  if (i < 0)
179    return i;
180
181  /* dynamically updated stack window for uber threads to avoid get_specific
182     call */
183  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184    KMP_FATAL(StackOverflow, i);
185  }
186
187  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188  if (stack_addr > stack_base) {
189    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192                stack_base);
193  } else {
194    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195            stack_base - stack_addr);
196  }
197
198  /* Reprint stack bounds for ubermaster since they have been refined */
199  if (__kmp_storage_map) {
200    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203                                 other_threads[i]->th.th_info.ds.ds_stacksize,
204                                 "th_%d stack (refinement)", i);
205  }
206  return i;
207}
208
209int __kmp_get_global_thread_id_reg() {
210  int gtid;
211
212  if (!__kmp_init_serial) {
213    gtid = KMP_GTID_DNE;
214  } else
215#ifdef KMP_TDATA_GTID
216      if (TCR_4(__kmp_gtid_mode) >= 3) {
217    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218    gtid = __kmp_gtid;
219  } else
220#endif
221      if (TCR_4(__kmp_gtid_mode) >= 2) {
222    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223    gtid = __kmp_gtid_get_specific();
224  } else {
225    KA_TRACE(1000,
226             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227    gtid = __kmp_get_global_thread_id();
228  }
229
230  /* we must be a new uber master sibling thread */
231  if (gtid == KMP_GTID_DNE) {
232    KA_TRACE(10,
233             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234              "Registering a new gtid.\n"));
235    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236    if (!__kmp_init_serial) {
237      __kmp_do_serial_initialize();
238      gtid = __kmp_gtid_get_specific();
239    } else {
240      gtid = __kmp_register_root(FALSE);
241    }
242    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244  }
245
246  KMP_DEBUG_ASSERT(gtid >= 0);
247
248  return gtid;
249}
250
251/* caller must hold forkjoin_lock */
252void __kmp_check_stack_overlap(kmp_info_t *th) {
253  int f;
254  char *stack_beg = NULL;
255  char *stack_end = NULL;
256  int gtid;
257
258  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259  if (__kmp_storage_map) {
260    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262
263    gtid = __kmp_gtid_from_thread(th);
264
265    if (gtid == KMP_GTID_MONITOR) {
266      __kmp_print_storage_map_gtid(
267          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268          "th_%s stack (%s)", "mon",
269          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270    } else {
271      __kmp_print_storage_map_gtid(
272          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273          "th_%d stack (%s)", gtid,
274          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275    }
276  }
277
278  /* No point in checking ubermaster threads since they use refinement and
279   * cannot overlap */
280  gtid = __kmp_gtid_from_thread(th);
281  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282    KA_TRACE(10,
283             ("__kmp_check_stack_overlap: performing extensive checking\n"));
284    if (stack_beg == NULL) {
285      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287    }
288
289    for (f = 0; f < __kmp_threads_capacity; f++) {
290      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291
292      if (f_th && f_th != th) {
293        char *other_stack_end =
294            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295        char *other_stack_beg =
296            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298            (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299
300          /* Print the other stack values before the abort */
301          if (__kmp_storage_map)
302            __kmp_print_storage_map_gtid(
303                -1, other_stack_beg, other_stack_end,
304                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306
307          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308                      __kmp_msg_null);
309        }
310      }
311    }
312  }
313  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314}
315
316/* ------------------------------------------------------------------------ */
317
318void __kmp_infinite_loop(void) {
319  static int done = FALSE;
320
321  while (!done) {
322    KMP_YIELD(TRUE);
323  }
324}
325
326#define MAX_MESSAGE 512
327
328void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329                                  char const *format, ...) {
330  char buffer[MAX_MESSAGE];
331  va_list ap;
332
333  va_start(ap, format);
334  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335               p2, (unsigned long)size, format);
336  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337  __kmp_vprintf(kmp_err, buffer, ap);
338#if KMP_PRINT_DATA_PLACEMENT
339  int node;
340  if (gtid >= 0) {
341    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342      if (__kmp_storage_map_verbose) {
343        node = __kmp_get_host_node(p1);
344        if (node < 0) /* doesn't work, so don't try this next time */
345          __kmp_storage_map_verbose = FALSE;
346        else {
347          char *last;
348          int lastNode;
349          int localProc = __kmp_get_cpu_from_gtid(gtid);
350
351          const int page_size = KMP_GET_PAGE_SIZE();
352
353          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355          if (localProc >= 0)
356            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
357                                 localProc >> 1);
358          else
359            __kmp_printf_no_lock("  GTID %d\n", gtid);
360#if KMP_USE_PRCTL
361          /* The more elaborate format is disabled for now because of the prctl
362           * hanging bug. */
363          do {
364            last = p1;
365            lastNode = node;
366            /* This loop collates adjacent pages with the same host node. */
367            do {
368              (char *)p1 += page_size;
369            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
371                                 lastNode);
372          } while (p1 <= p2);
373#else
374          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375                               (char *)p1 + (page_size - 1),
376                               __kmp_get_host_node(p1));
377          if (p1 < p2) {
378            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                 (char *)p2 + (page_size - 1),
380                                 __kmp_get_host_node(p2));
381          }
382#endif
383        }
384      }
385    } else
386      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
387  }
388#endif /* KMP_PRINT_DATA_PLACEMENT */
389  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390}
391
392void __kmp_warn(char const *format, ...) {
393  char buffer[MAX_MESSAGE];
394  va_list ap;
395
396  if (__kmp_generate_warnings == kmp_warnings_off) {
397    return;
398  }
399
400  va_start(ap, format);
401
402  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404  __kmp_vprintf(kmp_err, buffer, ap);
405  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406
407  va_end(ap);
408}
409
410void __kmp_abort_process() {
411  // Later threads may stall here, but that's ok because abort() will kill them.
412  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413
414  if (__kmp_debug_buf) {
415    __kmp_dump_debug_buffer();
416  }
417
418  if (KMP_OS_WINDOWS) {
419    // Let other threads know of abnormal termination and prevent deadlock
420    // if abort happened during library initialization or shutdown
421    __kmp_global.g.g_abort = SIGABRT;
422
423    /* On Windows* OS by default abort() causes pop-up error box, which stalls
424       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425       boxes. _set_abort_behavior() works well, but this function is not
426       available in VS7 (this is not problem for DLL, but it is a problem for
427       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
428       help, at least in some versions of MS C RTL.
429
430       It seems following sequence is the only way to simulate abort() and
431       avoid pop-up error box. */
432    raise(SIGABRT);
433    _exit(3); // Just in case, if signal ignored, exit anyway.
434  } else {
435    abort();
436  }
437
438  __kmp_infinite_loop();
439  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440
441} // __kmp_abort_process
442
443void __kmp_abort_thread(void) {
444  // TODO: Eliminate g_abort global variable and this function.
445  // In case of abort just call abort(), it will kill all the threads.
446  __kmp_infinite_loop();
447} // __kmp_abort_thread
448
449/* Print out the storage map for the major kmp_info_t thread data structures
450   that are allocated together. */
451
452static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454                               gtid);
455
456  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458
459  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460                               sizeof(kmp_local_t), "th_%d.th_local", gtid);
461
462  __kmp_print_storage_map_gtid(
463      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465
466  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467                               &thr->th.th_bar[bs_plain_barrier + 1],
468                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469                               gtid);
470
471  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
473                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474                               gtid);
475
476#if KMP_FAST_REDUCTION_BARRIER
477  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478                               &thr->th.th_bar[bs_reduction_barrier + 1],
479                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480                               gtid);
481#endif // KMP_FAST_REDUCTION_BARRIER
482}
483
484/* Print out the storage map for the major kmp_team_t team data structures
485   that are allocated together. */
486
487static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488                                         int team_id, int num_thr) {
489  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491                               header, team_id);
492
493  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494                               &team->t.t_bar[bs_last_barrier],
495                               sizeof(kmp_balign_team_t) * bs_last_barrier,
496                               "%s_%d.t_bar", header, team_id);
497
498  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499                               &team->t.t_bar[bs_plain_barrier + 1],
500                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501                               header, team_id);
502
503  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504                               &team->t.t_bar[bs_forkjoin_barrier + 1],
505                               sizeof(kmp_balign_team_t),
506                               "%s_%d.t_bar[forkjoin]", header, team_id);
507
508#if KMP_FAST_REDUCTION_BARRIER
509  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510                               &team->t.t_bar[bs_reduction_barrier + 1],
511                               sizeof(kmp_balign_team_t),
512                               "%s_%d.t_bar[reduction]", header, team_id);
513#endif // KMP_FAST_REDUCTION_BARRIER
514
515  __kmp_print_storage_map_gtid(
516      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518
519  __kmp_print_storage_map_gtid(
520      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522
523  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524                               &team->t.t_disp_buffer[num_disp_buff],
525                               sizeof(dispatch_shared_info_t) * num_disp_buff,
526                               "%s_%d.t_disp_buffer", header, team_id);
527}
528
529static void __kmp_init_allocator() { __kmp_init_memkind(); }
530static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531
532/* ------------------------------------------------------------------------ */
533
534#if KMP_DYNAMIC_LIB
535#if KMP_OS_WINDOWS
536
537static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538  // TODO: Change to __kmp_break_bootstrap_lock().
539  __kmp_init_bootstrap_lock(lck); // make the lock released
540}
541
542static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543  int i;
544  int thread_count;
545
546  // PROCESS_DETACH is expected to be called by a thread that executes
547  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
549  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
550  // threads can be still alive here, although being about to be terminated. The
551  // threads in the array with ds_thread==0 are most suspicious. Actually, it
552  // can be not safe to access the __kmp_threads[].
553
554  // TODO: does it make sense to check __kmp_roots[] ?
555
556  // Let's check that there are no other alive threads registered with the OMP
557  // lib.
558  while (1) {
559    thread_count = 0;
560    for (i = 0; i < __kmp_threads_capacity; ++i) {
561      if (!__kmp_threads)
562        continue;
563      kmp_info_t *th = __kmp_threads[i];
564      if (th == NULL)
565        continue;
566      int gtid = th->th.th_info.ds.ds_gtid;
567      if (gtid == gtid_req)
568        continue;
569      if (gtid < 0)
570        continue;
571      DWORD exit_val;
572      int alive = __kmp_is_thread_alive(th, &exit_val);
573      if (alive) {
574        ++thread_count;
575      }
576    }
577    if (thread_count == 0)
578      break; // success
579  }
580
581  // Assume that I'm alone. Now it might be safe to check and reset locks.
582  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583  __kmp_reset_lock(&__kmp_forkjoin_lock);
584#ifdef KMP_DEBUG
585  __kmp_reset_lock(&__kmp_stdio_lock);
586#endif // KMP_DEBUG
587}
588
589BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591
592  switch (fdwReason) {
593
594  case DLL_PROCESS_ATTACH:
595    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596
597    return TRUE;
598
599  case DLL_PROCESS_DETACH:
600    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601
602    if (lpReserved != NULL) {
603      // lpReserved is used for telling the difference:
604      //   lpReserved == NULL when FreeLibrary() was called,
605      //   lpReserved != NULL when the process terminates.
606      // When FreeLibrary() is called, worker threads remain alive. So they will
607      // release the forkjoin lock by themselves. When the process terminates,
608      // worker threads disappear triggering the problem of unreleased forkjoin
609      // lock as described below.
610
611      // A worker thread can take the forkjoin lock. The problem comes up if
612      // that worker thread becomes dead before it releases the forkjoin lock.
613      // The forkjoin lock remains taken, while the thread executing
614      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615      // to take the forkjoin lock and will always fail, so that the application
616      // will never finish [normally]. This scenario is possible if
617      // __kmpc_end() has not been executed. It looks like it's not a corner
618      // case, but common cases:
619      // - the main function was compiled by an alternative compiler;
620      // - the main function was compiled by icl but without /Qopenmp
621      //   (application with plugins);
622      // - application terminates by calling C exit(), Fortran CALL EXIT() or
623      //   Fortran STOP.
624      // - alive foreign thread prevented __kmpc_end from doing cleanup.
625      //
626      // This is a hack to work around the problem.
627      // TODO: !!! figure out something better.
628      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629    }
630
631    __kmp_internal_end_library(__kmp_gtid_get_specific());
632
633    return TRUE;
634
635  case DLL_THREAD_ATTACH:
636    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637
638    /* if we want to register new siblings all the time here call
639     * __kmp_get_gtid(); */
640    return TRUE;
641
642  case DLL_THREAD_DETACH:
643    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644
645    __kmp_internal_end_thread(__kmp_gtid_get_specific());
646    return TRUE;
647  }
648
649  return TRUE;
650}
651
652#endif /* KMP_OS_WINDOWS */
653#endif /* KMP_DYNAMIC_LIB */
654
655/* __kmp_parallel_deo -- Wait until it's our turn. */
656void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657  int gtid = *gtid_ref;
658#ifdef BUILD_PARALLEL_ORDERED
659  kmp_team_t *team = __kmp_team_from_gtid(gtid);
660#endif /* BUILD_PARALLEL_ORDERED */
661
662  if (__kmp_env_consistency_check) {
663    if (__kmp_threads[gtid]->th.th_root->r.r_active)
664#if KMP_USE_DYNAMIC_LOCK
665      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666#else
667      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668#endif
669  }
670#ifdef BUILD_PARALLEL_ORDERED
671  if (!team->t.t_serialized) {
672    KMP_MB();
673    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674             NULL);
675    KMP_MB();
676  }
677#endif /* BUILD_PARALLEL_ORDERED */
678}
679
680/* __kmp_parallel_dxo -- Signal the next task. */
681void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682  int gtid = *gtid_ref;
683#ifdef BUILD_PARALLEL_ORDERED
684  int tid = __kmp_tid_from_gtid(gtid);
685  kmp_team_t *team = __kmp_team_from_gtid(gtid);
686#endif /* BUILD_PARALLEL_ORDERED */
687
688  if (__kmp_env_consistency_check) {
689    if (__kmp_threads[gtid]->th.th_root->r.r_active)
690      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691  }
692#ifdef BUILD_PARALLEL_ORDERED
693  if (!team->t.t_serialized) {
694    KMP_MB(); /* Flush all pending memory write invalidates.  */
695
696    /* use the tid of the next thread in this team */
697    /* TODO replace with general release procedure */
698    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699
700    KMP_MB(); /* Flush all pending memory write invalidates.  */
701  }
702#endif /* BUILD_PARALLEL_ORDERED */
703}
704
705/* ------------------------------------------------------------------------ */
706/* The BARRIER for a SINGLE process section is always explicit   */
707
708int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709  int status;
710  kmp_info_t *th;
711  kmp_team_t *team;
712
713  if (!TCR_4(__kmp_init_parallel))
714    __kmp_parallel_initialize();
715  __kmp_resume_if_soft_paused();
716
717  th = __kmp_threads[gtid];
718  team = th->th.th_team;
719  status = 0;
720
721  th->th.th_ident = id_ref;
722
723  if (team->t.t_serialized) {
724    status = 1;
725  } else {
726    kmp_int32 old_this = th->th.th_local.this_construct;
727
728    ++th->th.th_local.this_construct;
729    /* try to set team count to thread count--success means thread got the
730       single block */
731    /* TODO: Should this be acquire or release? */
732    if (team->t.t_construct == old_this) {
733      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734                                              th->th.th_local.this_construct);
735    }
736#if USE_ITT_BUILD
737    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739        team->t.t_active_level ==
740            1) { // Only report metadata by master of active team at level 1
741      __kmp_itt_metadata_single(id_ref);
742    }
743#endif /* USE_ITT_BUILD */
744  }
745
746  if (__kmp_env_consistency_check) {
747    if (status && push_ws) {
748      __kmp_push_workshare(gtid, ct_psingle, id_ref);
749    } else {
750      __kmp_check_workshare(gtid, ct_psingle, id_ref);
751    }
752  }
753#if USE_ITT_BUILD
754  if (status) {
755    __kmp_itt_single_start(gtid);
756  }
757#endif /* USE_ITT_BUILD */
758  return status;
759}
760
761void __kmp_exit_single(int gtid) {
762#if USE_ITT_BUILD
763  __kmp_itt_single_end(gtid);
764#endif /* USE_ITT_BUILD */
765  if (__kmp_env_consistency_check)
766    __kmp_pop_workshare(gtid, ct_psingle, NULL);
767}
768
769/* determine if we can go parallel or must use a serialized parallel region and
770 * how many threads we can use
771 * set_nproc is the number of threads requested for the team
772 * returns 0 if we should serialize or only use one thread,
773 * otherwise the number of threads to use
774 * The forkjoin lock is held by the caller. */
775static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776                                 int master_tid, int set_nthreads,
777                                 int enter_teams) {
778  int capacity;
779  int new_nthreads;
780  KMP_DEBUG_ASSERT(__kmp_init_serial);
781  KMP_DEBUG_ASSERT(root && parent_team);
782  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783
784  // If dyn-var is set, dynamically adjust the number of desired threads,
785  // according to the method specified by dynamic_mode.
786  new_nthreads = set_nthreads;
787  if (!get__dynamic_2(parent_team, master_tid)) {
788    ;
789  }
790#ifdef USE_LOAD_BALANCE
791  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793    if (new_nthreads == 1) {
794      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795                    "reservation to 1 thread\n",
796                    master_tid));
797      return 1;
798    }
799    if (new_nthreads < set_nthreads) {
800      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801                    "reservation to %d threads\n",
802                    master_tid, new_nthreads));
803    }
804  }
805#endif /* USE_LOAD_BALANCE */
806  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807    new_nthreads = __kmp_avail_proc - __kmp_nth +
808                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809    if (new_nthreads <= 1) {
810      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811                    "reservation to 1 thread\n",
812                    master_tid));
813      return 1;
814    }
815    if (new_nthreads < set_nthreads) {
816      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817                    "reservation to %d threads\n",
818                    master_tid, new_nthreads));
819    } else {
820      new_nthreads = set_nthreads;
821    }
822  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823    if (set_nthreads > 2) {
824      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825      new_nthreads = (new_nthreads % set_nthreads) + 1;
826      if (new_nthreads == 1) {
827        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828                      "reservation to 1 thread\n",
829                      master_tid));
830        return 1;
831      }
832      if (new_nthreads < set_nthreads) {
833        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834                      "reservation to %d threads\n",
835                      master_tid, new_nthreads));
836      }
837    }
838  } else {
839    KMP_ASSERT(0);
840  }
841
842  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843  if (__kmp_nth + new_nthreads -
844          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845      __kmp_max_nth) {
846    int tl_nthreads = __kmp_max_nth - __kmp_nth +
847                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848    if (tl_nthreads <= 0) {
849      tl_nthreads = 1;
850    }
851
852    // If dyn-var is false, emit a 1-time warning.
853    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854      __kmp_reserve_warn = 1;
855      __kmp_msg(kmp_ms_warning,
856                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858    }
859    if (tl_nthreads == 1) {
860      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861                    "reduced reservation to 1 thread\n",
862                    master_tid));
863      return 1;
864    }
865    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866                  "reservation to %d threads\n",
867                  master_tid, tl_nthreads));
868    new_nthreads = tl_nthreads;
869  }
870
871  // Respect OMP_THREAD_LIMIT
872  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874  if (cg_nthreads + new_nthreads -
875          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876      max_cg_threads) {
877    int tl_nthreads = max_cg_threads - cg_nthreads +
878                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879    if (tl_nthreads <= 0) {
880      tl_nthreads = 1;
881    }
882
883    // If dyn-var is false, emit a 1-time warning.
884    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885      __kmp_reserve_warn = 1;
886      __kmp_msg(kmp_ms_warning,
887                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889    }
890    if (tl_nthreads == 1) {
891      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892                    "reduced reservation to 1 thread\n",
893                    master_tid));
894      return 1;
895    }
896    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897                  "reservation to %d threads\n",
898                  master_tid, tl_nthreads));
899    new_nthreads = tl_nthreads;
900  }
901
902  // Check if the threads array is large enough, or needs expanding.
903  // See comment in __kmp_register_root() about the adjustment if
904  // __kmp_threads[0] == NULL.
905  capacity = __kmp_threads_capacity;
906  if (TCR_PTR(__kmp_threads[0]) == NULL) {
907    --capacity;
908  }
909  if (__kmp_nth + new_nthreads -
910          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911      capacity) {
912    // Expand the threads array.
913    int slotsRequired = __kmp_nth + new_nthreads -
914                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915                        capacity;
916    int slotsAdded = __kmp_expand_threads(slotsRequired);
917    if (slotsAdded < slotsRequired) {
918      // The threads array was not expanded enough.
919      new_nthreads -= (slotsRequired - slotsAdded);
920      KMP_ASSERT(new_nthreads >= 1);
921
922      // If dyn-var is false, emit a 1-time warning.
923      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924        __kmp_reserve_warn = 1;
925        if (__kmp_tp_cached) {
926          __kmp_msg(kmp_ms_warning,
927                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930        } else {
931          __kmp_msg(kmp_ms_warning,
932                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934        }
935      }
936    }
937  }
938
939#ifdef KMP_DEBUG
940  if (new_nthreads == 1) {
941    KC_TRACE(10,
942             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943              "dead roots and rechecking; requested %d threads\n",
944              __kmp_get_gtid(), set_nthreads));
945  } else {
946    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947                  " %d threads\n",
948                  __kmp_get_gtid(), new_nthreads, set_nthreads));
949  }
950#endif // KMP_DEBUG
951  return new_nthreads;
952}
953
954/* Allocate threads from the thread pool and assign them to the new team. We are
955   assured that there are enough threads available, because we checked on that
956   earlier within critical section forkjoin */
957static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958                                    kmp_info_t *master_th, int master_gtid) {
959  int i;
960  int use_hot_team;
961
962  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964  KMP_MB();
965
966  /* first, let's setup the master thread */
967  master_th->th.th_info.ds.ds_tid = 0;
968  master_th->th.th_team = team;
969  master_th->th.th_team_nproc = team->t.t_nproc;
970  master_th->th.th_team_master = master_th;
971  master_th->th.th_team_serialized = FALSE;
972  master_th->th.th_dispatch = &team->t.t_dispatch[0];
973
974/* make sure we are not the optimized hot team */
975#if KMP_NESTED_HOT_TEAMS
976  use_hot_team = 0;
977  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978  if (hot_teams) { // hot teams array is not allocated if
979    // KMP_HOT_TEAMS_MAX_LEVEL=0
980    int level = team->t.t_active_level - 1; // index in array of hot teams
981    if (master_th->th.th_teams_microtask) { // are we inside the teams?
982      if (master_th->th.th_teams_size.nteams > 1) {
983        ++level; // level was not increased in teams construct for
984        // team_of_masters
985      }
986      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987          master_th->th.th_teams_level == team->t.t_level) {
988        ++level; // level was not increased in teams construct for
989        // team_of_workers before the parallel
990      } // team->t.t_level will be increased inside parallel
991    }
992    if (level < __kmp_hot_teams_max_level) {
993      if (hot_teams[level].hot_team) {
994        // hot team has already been allocated for given level
995        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996        use_hot_team = 1; // the team is ready to use
997      } else {
998        use_hot_team = 0; // AC: threads are not allocated yet
999        hot_teams[level].hot_team = team; // remember new hot team
1000        hot_teams[level].hot_team_nth = team->t.t_nproc;
1001      }
1002    } else {
1003      use_hot_team = 0;
1004    }
1005  }
1006#else
1007  use_hot_team = team == root->r.r_hot_team;
1008#endif
1009  if (!use_hot_team) {
1010
1011    /* install the master thread */
1012    team->t.t_threads[0] = master_th;
1013    __kmp_initialize_info(master_th, team, 0, master_gtid);
1014
1015    /* now, install the worker threads */
1016    for (i = 1; i < team->t.t_nproc; i++) {
1017
1018      /* fork or reallocate a new thread and install it in team */
1019      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020      team->t.t_threads[i] = thr;
1021      KMP_DEBUG_ASSERT(thr);
1022      KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023      /* align team and thread arrived states */
1024      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029                    team->t.t_bar[bs_plain_barrier].b_arrived));
1030      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031      thr->th.th_teams_level = master_th->th.th_teams_level;
1032      thr->th.th_teams_size = master_th->th.th_teams_size;
1033      { // Initialize threads' barrier data.
1034        int b;
1035        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036        for (b = 0; b < bs_last_barrier; ++b) {
1037          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039#if USE_DEBUGGER
1040          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041#endif
1042        }
1043      }
1044    }
1045
1046#if KMP_AFFINITY_SUPPORTED
1047    __kmp_partition_places(team);
1048#endif
1049  }
1050
1051  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052    for (i = 0; i < team->t.t_nproc; i++) {
1053      kmp_info_t *thr = team->t.t_threads[i];
1054      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055          thr->th.th_prev_level != team->t.t_level) {
1056        team->t.t_display_affinity = 1;
1057        break;
1058      }
1059    }
1060  }
1061
1062  KMP_MB();
1063}
1064
1065#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066// Propagate any changes to the floating point control registers out to the team
1067// We try to avoid unnecessary writes to the relevant cache line in the team
1068// structure, so we don't make changes unless they are needed.
1069inline static void propagateFPControl(kmp_team_t *team) {
1070  if (__kmp_inherit_fp_control) {
1071    kmp_int16 x87_fpu_control_word;
1072    kmp_uint32 mxcsr;
1073
1074    // Get master values of FPU control flags (both X87 and vector)
1075    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076    __kmp_store_mxcsr(&mxcsr);
1077    mxcsr &= KMP_X86_MXCSR_MASK;
1078
1079    // There is no point looking at t_fp_control_saved here.
1080    // If it is TRUE, we still have to update the values if they are different
1081    // from those we now have. If it is FALSE we didn't save anything yet, but
1082    // our objective is the same. We have to ensure that the values in the team
1083    // are the same as those we have.
1084    // So, this code achieves what we need whether or not t_fp_control_saved is
1085    // true. By checking whether the value needs updating we avoid unnecessary
1086    // writes that would put the cache-line into a written state, causing all
1087    // threads in the team to have to read it again.
1088    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090    // Although we don't use this value, other code in the runtime wants to know
1091    // whether it should restore them. So we must ensure it is correct.
1092    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093  } else {
1094    // Similarly here. Don't write to this cache-line in the team structure
1095    // unless we have to.
1096    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097  }
1098}
1099
1100// Do the opposite, setting the hardware registers to the updated values from
1101// the team.
1102inline static void updateHWFPControl(kmp_team_t *team) {
1103  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104    // Only reset the fp control regs if they have been changed in the team.
1105    // the parallel region that we are exiting.
1106    kmp_int16 x87_fpu_control_word;
1107    kmp_uint32 mxcsr;
1108    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109    __kmp_store_mxcsr(&mxcsr);
1110    mxcsr &= KMP_X86_MXCSR_MASK;
1111
1112    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113      __kmp_clear_x87_fpu_status_word();
1114      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115    }
1116
1117    if (team->t.t_mxcsr != mxcsr) {
1118      __kmp_load_mxcsr(&team->t.t_mxcsr);
1119    }
1120  }
1121}
1122#else
1123#define propagateFPControl(x) ((void)0)
1124#define updateHWFPControl(x) ((void)0)
1125#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1126
1127static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128                                     int realloc); // forward declaration
1129
1130/* Run a parallel region that has been serialized, so runs only in a team of the
1131   single master thread. */
1132void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133  kmp_info_t *this_thr;
1134  kmp_team_t *serial_team;
1135
1136  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137
1138  /* Skip all this code for autopar serialized loops since it results in
1139     unacceptable overhead */
1140  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141    return;
1142
1143  if (!TCR_4(__kmp_init_parallel))
1144    __kmp_parallel_initialize();
1145  __kmp_resume_if_soft_paused();
1146
1147  this_thr = __kmp_threads[global_tid];
1148  serial_team = this_thr->th.th_serial_team;
1149
1150  /* utilize the serialized team held by this thread */
1151  KMP_DEBUG_ASSERT(serial_team);
1152  KMP_MB();
1153
1154  if (__kmp_tasking_mode != tskm_immediate_exec) {
1155    KMP_DEBUG_ASSERT(
1156        this_thr->th.th_task_team ==
1157        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159                     NULL);
1160    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161                  "team %p, new task_team = NULL\n",
1162                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163    this_thr->th.th_task_team = NULL;
1164  }
1165
1166  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168    proc_bind = proc_bind_false;
1169  } else if (proc_bind == proc_bind_default) {
1170    // No proc_bind clause was specified, so use the current value
1171    // of proc-bind-var for this parallel region.
1172    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173  }
1174  // Reset for next parallel region
1175  this_thr->th.th_set_proc_bind = proc_bind_default;
1176
1177#if OMPT_SUPPORT
1178  ompt_data_t ompt_parallel_data = ompt_data_none;
1179  ompt_data_t *implicit_task_data;
1180  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181  if (ompt_enabled.enabled &&
1182      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183
1184    ompt_task_info_t *parent_task_info;
1185    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186
1187    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188    if (ompt_enabled.ompt_callback_parallel_begin) {
1189      int team_size = 1;
1190
1191      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192          &(parent_task_info->task_data), &(parent_task_info->frame),
1193          &ompt_parallel_data, team_size,
1194          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1195    }
1196  }
1197#endif // OMPT_SUPPORT
1198
1199  if (this_thr->th.th_team != serial_team) {
1200    // Nested level will be an index in the nested nthreads array
1201    int level = this_thr->th.th_team->t.t_level;
1202
1203    if (serial_team->t.t_serialized) {
1204      /* this serial team was already used
1205         TODO increase performance by making this locks more specific */
1206      kmp_team_t *new_team;
1207
1208      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209
1210      new_team =
1211          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212#if OMPT_SUPPORT
1213                              ompt_parallel_data,
1214#endif
1215                              proc_bind, &this_thr->th.th_current_task->td_icvs,
1216                              0 USE_NESTED_HOT_ARG(NULL));
1217      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218      KMP_ASSERT(new_team);
1219
1220      /* setup new serialized team and install it */
1221      new_team->t.t_threads[0] = this_thr;
1222      new_team->t.t_parent = this_thr->th.th_team;
1223      serial_team = new_team;
1224      this_thr->th.th_serial_team = serial_team;
1225
1226      KF_TRACE(
1227          10,
1228          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229           global_tid, serial_team));
1230
1231      /* TODO the above breaks the requirement that if we run out of resources,
1232         then we can still guarantee that serialized teams are ok, since we may
1233         need to allocate a new one */
1234    } else {
1235      KF_TRACE(
1236          10,
1237          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238           global_tid, serial_team));
1239    }
1240
1241    /* we have to initialize this serial team */
1242    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245    serial_team->t.t_ident = loc;
1246    serial_team->t.t_serialized = 1;
1247    serial_team->t.t_nproc = 1;
1248    serial_team->t.t_parent = this_thr->th.th_team;
1249    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250    this_thr->th.th_team = serial_team;
1251    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252
1253    KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1254                  this_thr->th.th_current_task));
1255    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256    this_thr->th.th_current_task->td_flags.executing = 0;
1257
1258    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259
1260    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261       implicit task for each serialized task represented by
1262       team->t.t_serialized? */
1263    copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264              &this_thr->th.th_current_task->td_parent->td_icvs);
1265
1266    // Thread value exists in the nested nthreads array for the next nested
1267    // level
1268    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269      this_thr->th.th_current_task->td_icvs.nproc =
1270          __kmp_nested_nth.nth[level + 1];
1271    }
1272
1273    if (__kmp_nested_proc_bind.used &&
1274        (level + 1 < __kmp_nested_proc_bind.used)) {
1275      this_thr->th.th_current_task->td_icvs.proc_bind =
1276          __kmp_nested_proc_bind.bind_types[level + 1];
1277    }
1278
1279#if USE_DEBUGGER
1280    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281#endif
1282    this_thr->th.th_info.ds.ds_tid = 0;
1283
1284    /* set thread cache values */
1285    this_thr->th.th_team_nproc = 1;
1286    this_thr->th.th_team_master = this_thr;
1287    this_thr->th.th_team_serialized = 1;
1288
1289    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292
1293    propagateFPControl(serial_team);
1294
1295    /* check if we need to allocate dispatch buffers stack */
1296    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297    if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298      serial_team->t.t_dispatch->th_disp_buffer =
1299          (dispatch_private_info_t *)__kmp_allocate(
1300              sizeof(dispatch_private_info_t));
1301    }
1302    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303
1304    KMP_MB();
1305
1306  } else {
1307    /* this serialized team is already being used,
1308     * that's fine, just add another nested level */
1309    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312    ++serial_team->t.t_serialized;
1313    this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314
1315    // Nested level will be an index in the nested nthreads array
1316    int level = this_thr->th.th_team->t.t_level;
1317    // Thread value exists in the nested nthreads array for the next nested
1318    // level
1319    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320      this_thr->th.th_current_task->td_icvs.nproc =
1321          __kmp_nested_nth.nth[level + 1];
1322    }
1323    serial_team->t.t_level++;
1324    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325                  "of serial team %p to %d\n",
1326                  global_tid, serial_team, serial_team->t.t_level));
1327
1328    /* allocate/push dispatch buffers stack */
1329    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330    {
1331      dispatch_private_info_t *disp_buffer =
1332          (dispatch_private_info_t *)__kmp_allocate(
1333              sizeof(dispatch_private_info_t));
1334      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336    }
1337    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338
1339    KMP_MB();
1340  }
1341  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342
1343  // Perform the display affinity functionality for
1344  // serialized parallel regions
1345  if (__kmp_display_affinity) {
1346    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347        this_thr->th.th_prev_num_threads != 1) {
1348      // NULL means use the affinity-format-var ICV
1349      __kmp_aux_display_affinity(global_tid, NULL);
1350      this_thr->th.th_prev_level = serial_team->t.t_level;
1351      this_thr->th.th_prev_num_threads = 1;
1352    }
1353  }
1354
1355  if (__kmp_env_consistency_check)
1356    __kmp_push_parallel(global_tid, NULL);
1357#if OMPT_SUPPORT
1358  serial_team->t.ompt_team_info.master_return_address = codeptr;
1359  if (ompt_enabled.enabled &&
1360      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362
1363    ompt_lw_taskteam_t lw_taskteam;
1364    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365                            &ompt_parallel_data, codeptr);
1366
1367    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368    // don't use lw_taskteam after linking. content was swaped
1369
1370    /* OMPT implicit task begin */
1371    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372    if (ompt_enabled.ompt_callback_implicit_task) {
1373      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376      OMPT_CUR_TASK_INFO(this_thr)
1377          ->thread_num = __kmp_tid_from_gtid(global_tid);
1378    }
1379
1380    /* OMPT state */
1381    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383  }
1384#endif
1385}
1386
1387/* most of the work for a fork */
1388/* return true if we really went parallel, false if serialized */
1389int __kmp_fork_call(ident_t *loc, int gtid,
1390                    enum fork_context_e call_context, // Intel, GNU, ...
1391                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392/* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1394                    va_list *ap
1395#else
1396                    va_list ap
1397#endif
1398                    ) {
1399  void **argv;
1400  int i;
1401  int master_tid;
1402  int master_this_cons;
1403  kmp_team_t *team;
1404  kmp_team_t *parent_team;
1405  kmp_info_t *master_th;
1406  kmp_root_t *root;
1407  int nthreads;
1408  int master_active;
1409  int master_set_numthreads;
1410  int level;
1411  int active_level;
1412  int teams_level;
1413#if KMP_NESTED_HOT_TEAMS
1414  kmp_hot_team_ptr_t **p_hot_teams;
1415#endif
1416  { // KMP_TIME_BLOCK
1417    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1419
1420    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422      /* Some systems prefer the stack for the root thread(s) to start with */
1423      /* some gap from the parent stack to prevent false sharing. */
1424      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425      /* These 2 lines below are so this does not get optimized out */
1426      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427        __kmp_stkpadding += (short)((kmp_int64)dummy);
1428    }
1429
1430    /* initialize if needed */
1431    KMP_DEBUG_ASSERT(
1432        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433    if (!TCR_4(__kmp_init_parallel))
1434      __kmp_parallel_initialize();
1435    __kmp_resume_if_soft_paused();
1436
1437    /* setup current data */
1438    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1439    // shutdown
1440    parent_team = master_th->th.th_team;
1441    master_tid = master_th->th.th_info.ds.ds_tid;
1442    master_this_cons = master_th->th.th_local.this_construct;
1443    root = master_th->th.th_root;
1444    master_active = root->r.r_active;
1445    master_set_numthreads = master_th->th.th_set_nproc;
1446
1447#if OMPT_SUPPORT
1448    ompt_data_t ompt_parallel_data = ompt_data_none;
1449    ompt_data_t *parent_task_data;
1450    ompt_frame_t *ompt_frame;
1451    ompt_data_t *implicit_task_data;
1452    void *return_address = NULL;
1453
1454    if (ompt_enabled.enabled) {
1455      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1456                                    NULL, NULL);
1457      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1458    }
1459#endif
1460
1461    // Nested level will be an index in the nested nthreads array
1462    level = parent_team->t.t_level;
1463    // used to launch non-serial teams even if nested is not allowed
1464    active_level = parent_team->t.t_active_level;
1465    // needed to check nesting inside the teams
1466    teams_level = master_th->th.th_teams_level;
1467#if KMP_NESTED_HOT_TEAMS
1468    p_hot_teams = &master_th->th.th_hot_teams;
1469    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473      // it is either actual or not needed (when active_level > 0)
1474      (*p_hot_teams)[0].hot_team_nth = 1;
1475    }
1476#endif
1477
1478#if OMPT_SUPPORT
1479    if (ompt_enabled.enabled) {
1480      if (ompt_enabled.ompt_callback_parallel_begin) {
1481        int team_size = master_set_numthreads
1482                            ? master_set_numthreads
1483                            : get__nproc_2(parent_team, master_tid);
1484        int flags = OMPT_INVOKER(call_context) |
1485                    ((microtask == (microtask_t)__kmp_teams_master)
1486                         ? ompt_parallel_league
1487                         : ompt_parallel_team);
1488        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1489            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1490            return_address);
1491      }
1492      master_th->th.ompt_thread_info.state = ompt_state_overhead;
1493    }
1494#endif
1495
1496    master_th->th.th_ident = loc;
1497
1498    if (master_th->th.th_teams_microtask && ap &&
1499        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1500      // AC: This is start of parallel that is nested inside teams construct.
1501      // The team is actual (hot), all workers are ready at the fork barrier.
1502      // No lock needed to initialize the team a bit, then free workers.
1503      parent_team->t.t_ident = loc;
1504      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1505      parent_team->t.t_argc = argc;
1506      argv = (void **)parent_team->t.t_argv;
1507      for (i = argc - 1; i >= 0; --i)
1508/* TODO: revert workaround for Intel(R) 64 tracker #96 */
1509#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1510        *argv++ = va_arg(*ap, void *);
1511#else
1512        *argv++ = va_arg(ap, void *);
1513#endif
1514      // Increment our nested depth levels, but not increase the serialization
1515      if (parent_team == master_th->th.th_serial_team) {
1516        // AC: we are in serialized parallel
1517        __kmpc_serialized_parallel(loc, gtid);
1518        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1519
1520#if OMPT_SUPPORT
1521        void *dummy;
1522        void **exit_frame_p;
1523
1524        ompt_lw_taskteam_t lw_taskteam;
1525
1526        if (ompt_enabled.enabled) {
1527          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1528                                  &ompt_parallel_data, return_address);
1529          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1530
1531          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1532          // don't use lw_taskteam after linking. content was swaped
1533
1534          /* OMPT implicit task begin */
1535          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1536          if (ompt_enabled.ompt_callback_implicit_task) {
1537            OMPT_CUR_TASK_INFO(master_th)
1538                ->thread_num = __kmp_tid_from_gtid(gtid);
1539            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1540                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1541                implicit_task_data, 1,
1542                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543          }
1544
1545          /* OMPT state */
1546          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1547        } else {
1548          exit_frame_p = &dummy;
1549        }
1550#endif
1551        // AC: need to decrement t_serialized for enquiry functions to work
1552        // correctly, will restore at join time
1553        parent_team->t.t_serialized--;
1554
1555        {
1556          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1557          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1558          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1559#if OMPT_SUPPORT
1560                                 ,
1561                                 exit_frame_p
1562#endif
1563                                 );
1564        }
1565
1566#if OMPT_SUPPORT
1567        if (ompt_enabled.enabled) {
1568          *exit_frame_p = NULL;
1569          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1570          if (ompt_enabled.ompt_callback_implicit_task) {
1571            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1572                ompt_scope_end, NULL, implicit_task_data, 1,
1573                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1574          }
1575          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1576          __ompt_lw_taskteam_unlink(master_th);
1577          if (ompt_enabled.ompt_callback_parallel_end) {
1578            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1579                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1580                OMPT_INVOKER(call_context) | ompt_parallel_team,
1581                return_address);
1582          }
1583          master_th->th.ompt_thread_info.state = ompt_state_overhead;
1584        }
1585#endif
1586        return TRUE;
1587      }
1588
1589      parent_team->t.t_pkfn = microtask;
1590      parent_team->t.t_invoke = invoker;
1591      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1592      parent_team->t.t_active_level++;
1593      parent_team->t.t_level++;
1594      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1595
1596#if OMPT_SUPPORT
1597      if (ompt_enabled.enabled) {
1598        ompt_lw_taskteam_t lw_taskteam;
1599        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1600                                &ompt_parallel_data, return_address);
1601        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1602      }
1603#endif
1604
1605      /* Change number of threads in the team if requested */
1606      if (master_set_numthreads) { // The parallel has num_threads clause
1607        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1608          // AC: only can reduce number of threads dynamically, can't increase
1609          kmp_info_t **other_threads = parent_team->t.t_threads;
1610          parent_team->t.t_nproc = master_set_numthreads;
1611          for (i = 0; i < master_set_numthreads; ++i) {
1612            other_threads[i]->th.th_team_nproc = master_set_numthreads;
1613          }
1614          // Keep extra threads hot in the team for possible next parallels
1615        }
1616        master_th->th.th_set_nproc = 0;
1617      }
1618
1619#if USE_DEBUGGER
1620      if (__kmp_debugging) { // Let debugger override number of threads.
1621        int nth = __kmp_omp_num_threads(loc);
1622        if (nth > 0) { // 0 means debugger doesn't want to change num threads
1623          master_set_numthreads = nth;
1624        }
1625      }
1626#endif
1627
1628      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1629                    "master_th=%p, gtid=%d\n",
1630                    root, parent_team, master_th, gtid));
1631      __kmp_internal_fork(loc, gtid, parent_team);
1632      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1633                    "master_th=%p, gtid=%d\n",
1634                    root, parent_team, master_th, gtid));
1635
1636      /* Invoke microtask for MASTER thread */
1637      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1638                    parent_team->t.t_id, parent_team->t.t_pkfn));
1639
1640      if (!parent_team->t.t_invoke(gtid)) {
1641        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1642      }
1643      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1644                    parent_team->t.t_id, parent_team->t.t_pkfn));
1645      KMP_MB(); /* Flush all pending memory write invalidates.  */
1646
1647      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1648
1649      return TRUE;
1650    } // Parallel closely nested in teams construct
1651
1652#if KMP_DEBUG
1653    if (__kmp_tasking_mode != tskm_immediate_exec) {
1654      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1655                       parent_team->t.t_task_team[master_th->th.th_task_state]);
1656    }
1657#endif
1658
1659    if (parent_team->t.t_active_level >=
1660        master_th->th.th_current_task->td_icvs.max_active_levels) {
1661      nthreads = 1;
1662    } else {
1663      int enter_teams = ((ap == NULL && active_level == 0) ||
1664                         (ap && teams_level > 0 && teams_level == level));
1665      nthreads =
1666          master_set_numthreads
1667              ? master_set_numthreads
1668              : get__nproc_2(
1669                    parent_team,
1670                    master_tid); // TODO: get nproc directly from current task
1671
1672      // Check if we need to take forkjoin lock? (no need for serialized
1673      // parallel out of teams construct). This code moved here from
1674      // __kmp_reserve_threads() to speedup nested serialized parallels.
1675      if (nthreads > 1) {
1676        if ((get__max_active_levels(master_th) == 1 &&
1677             (root->r.r_in_parallel && !enter_teams)) ||
1678            (__kmp_library == library_serial)) {
1679          KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1680                        " threads\n",
1681                        gtid, nthreads));
1682          nthreads = 1;
1683        }
1684      }
1685      if (nthreads > 1) {
1686        /* determine how many new threads we can use */
1687        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1688        /* AC: If we execute teams from parallel region (on host), then teams
1689           should be created but each can only have 1 thread if nesting is
1690           disabled. If teams called from serial region, then teams and their
1691           threads should be created regardless of the nesting setting. */
1692        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1693                                         nthreads, enter_teams);
1694        if (nthreads == 1) {
1695          // Free lock for single thread execution here; for multi-thread
1696          // execution it will be freed later after team of threads created
1697          // and initialized
1698          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1699        }
1700      }
1701    }
1702    KMP_DEBUG_ASSERT(nthreads > 0);
1703
1704    // If we temporarily changed the set number of threads then restore it now
1705    master_th->th.th_set_nproc = 0;
1706
1707    /* create a serialized parallel region? */
1708    if (nthreads == 1) {
1709/* josh todo: hypothetical question: what do we do for OS X*? */
1710#if KMP_OS_LINUX &&                                                            \
1711    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1712      void *args[argc];
1713#else
1714      void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1715#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1716          KMP_ARCH_AARCH64) */
1717
1718      KA_TRACE(20,
1719               ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1720
1721      __kmpc_serialized_parallel(loc, gtid);
1722
1723      if (call_context == fork_context_intel) {
1724        /* TODO this sucks, use the compiler itself to pass args! :) */
1725        master_th->th.th_serial_team->t.t_ident = loc;
1726        if (!ap) {
1727          // revert change made in __kmpc_serialized_parallel()
1728          master_th->th.th_serial_team->t.t_level--;
1729// Get args from parent team for teams construct
1730
1731#if OMPT_SUPPORT
1732          void *dummy;
1733          void **exit_frame_p;
1734          ompt_task_info_t *task_info;
1735
1736          ompt_lw_taskteam_t lw_taskteam;
1737
1738          if (ompt_enabled.enabled) {
1739            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1740                                    &ompt_parallel_data, return_address);
1741
1742            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1743            // don't use lw_taskteam after linking. content was swaped
1744
1745            task_info = OMPT_CUR_TASK_INFO(master_th);
1746            exit_frame_p = &(task_info->frame.exit_frame.ptr);
1747            if (ompt_enabled.ompt_callback_implicit_task) {
1748              OMPT_CUR_TASK_INFO(master_th)
1749                  ->thread_num = __kmp_tid_from_gtid(gtid);
1750              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1751                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1752                  &(task_info->task_data), 1,
1753                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1754                  ompt_task_implicit);
1755            }
1756
1757            /* OMPT state */
1758            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1759          } else {
1760            exit_frame_p = &dummy;
1761          }
1762#endif
1763
1764          {
1765            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1766            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1767            __kmp_invoke_microtask(microtask, gtid, 0, argc,
1768                                   parent_team->t.t_argv
1769#if OMPT_SUPPORT
1770                                   ,
1771                                   exit_frame_p
1772#endif
1773                                   );
1774          }
1775
1776#if OMPT_SUPPORT
1777          if (ompt_enabled.enabled) {
1778            *exit_frame_p = NULL;
1779            if (ompt_enabled.ompt_callback_implicit_task) {
1780              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1781                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1782                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1783                  ompt_task_implicit);
1784            }
1785            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1786            __ompt_lw_taskteam_unlink(master_th);
1787            if (ompt_enabled.ompt_callback_parallel_end) {
1788              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1789                  &ompt_parallel_data, parent_task_data,
1790                  OMPT_INVOKER(call_context) | ompt_parallel_team,
1791                  return_address);
1792            }
1793            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1794          }
1795#endif
1796        } else if (microtask == (microtask_t)__kmp_teams_master) {
1797          KMP_DEBUG_ASSERT(master_th->th.th_team ==
1798                           master_th->th.th_serial_team);
1799          team = master_th->th.th_team;
1800          // team->t.t_pkfn = microtask;
1801          team->t.t_invoke = invoker;
1802          __kmp_alloc_argv_entries(argc, team, TRUE);
1803          team->t.t_argc = argc;
1804          argv = (void **)team->t.t_argv;
1805          if (ap) {
1806            for (i = argc - 1; i >= 0; --i)
1807// TODO: revert workaround for Intel(R) 64 tracker #96
1808#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1809              *argv++ = va_arg(*ap, void *);
1810#else
1811              *argv++ = va_arg(ap, void *);
1812#endif
1813          } else {
1814            for (i = 0; i < argc; ++i)
1815              // Get args from parent team for teams construct
1816              argv[i] = parent_team->t.t_argv[i];
1817          }
1818          // AC: revert change made in __kmpc_serialized_parallel()
1819          //     because initial code in teams should have level=0
1820          team->t.t_level--;
1821          // AC: call special invoker for outer "parallel" of teams construct
1822          invoker(gtid);
1823#if OMPT_SUPPORT
1824          if (ompt_enabled.enabled) {
1825            ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1826            if (ompt_enabled.ompt_callback_implicit_task) {
1827              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828                  ompt_scope_end, NULL, &(task_info->task_data), 0,
1829                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1830            }
1831            if (ompt_enabled.ompt_callback_parallel_end) {
1832              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1833                  &ompt_parallel_data, parent_task_data,
1834                  OMPT_INVOKER(call_context) | ompt_parallel_league,
1835                  return_address);
1836            }
1837            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1838          }
1839#endif
1840        } else {
1841          argv = args;
1842          for (i = argc - 1; i >= 0; --i)
1843// TODO: revert workaround for Intel(R) 64 tracker #96
1844#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1845            *argv++ = va_arg(*ap, void *);
1846#else
1847            *argv++ = va_arg(ap, void *);
1848#endif
1849          KMP_MB();
1850
1851#if OMPT_SUPPORT
1852          void *dummy;
1853          void **exit_frame_p;
1854          ompt_task_info_t *task_info;
1855
1856          ompt_lw_taskteam_t lw_taskteam;
1857
1858          if (ompt_enabled.enabled) {
1859            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1860                                    &ompt_parallel_data, return_address);
1861            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1862            // don't use lw_taskteam after linking. content was swaped
1863            task_info = OMPT_CUR_TASK_INFO(master_th);
1864            exit_frame_p = &(task_info->frame.exit_frame.ptr);
1865
1866            /* OMPT implicit task begin */
1867            implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1868            if (ompt_enabled.ompt_callback_implicit_task) {
1869              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1870                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1871                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1872                  ompt_task_implicit);
1873              OMPT_CUR_TASK_INFO(master_th)
1874                  ->thread_num = __kmp_tid_from_gtid(gtid);
1875            }
1876
1877            /* OMPT state */
1878            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1879          } else {
1880            exit_frame_p = &dummy;
1881          }
1882#endif
1883
1884          {
1885            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1886            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1887            __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1888#if OMPT_SUPPORT
1889                                   ,
1890                                   exit_frame_p
1891#endif
1892                                   );
1893          }
1894
1895#if OMPT_SUPPORT
1896          if (ompt_enabled.enabled) {
1897            *exit_frame_p = NULL;
1898            if (ompt_enabled.ompt_callback_implicit_task) {
1899              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1900                  ompt_scope_end, NULL, &(task_info->task_data), 1,
1901                  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1902                  ompt_task_implicit);
1903            }
1904
1905            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1906            __ompt_lw_taskteam_unlink(master_th);
1907            if (ompt_enabled.ompt_callback_parallel_end) {
1908              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1909                  &ompt_parallel_data, parent_task_data,
1910                  OMPT_INVOKER(call_context) | ompt_parallel_team,
1911                  return_address);
1912            }
1913            master_th->th.ompt_thread_info.state = ompt_state_overhead;
1914          }
1915#endif
1916        }
1917      } else if (call_context == fork_context_gnu) {
1918#if OMPT_SUPPORT
1919        ompt_lw_taskteam_t lwt;
1920        __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1921                                return_address);
1922
1923        lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1924        __ompt_lw_taskteam_link(&lwt, master_th, 1);
1925// don't use lw_taskteam after linking. content was swaped
1926#endif
1927
1928        // we were called from GNU native code
1929        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1930        return FALSE;
1931      } else {
1932        KMP_ASSERT2(call_context < fork_context_last,
1933                    "__kmp_fork_call: unknown fork_context parameter");
1934      }
1935
1936      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1937      KMP_MB();
1938      return FALSE;
1939    } // if (nthreads == 1)
1940
1941    // GEH: only modify the executing flag in the case when not serialized
1942    //      serialized case is handled in kmpc_serialized_parallel
1943    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1944                  "curtask=%p, curtask_max_aclevel=%d\n",
1945                  parent_team->t.t_active_level, master_th,
1946                  master_th->th.th_current_task,
1947                  master_th->th.th_current_task->td_icvs.max_active_levels));
1948    // TODO: GEH - cannot do this assertion because root thread not set up as
1949    // executing
1950    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1951    master_th->th.th_current_task->td_flags.executing = 0;
1952
1953    if (!master_th->th.th_teams_microtask || level > teams_level) {
1954      /* Increment our nested depth level */
1955      KMP_ATOMIC_INC(&root->r.r_in_parallel);
1956    }
1957
1958    // See if we need to make a copy of the ICVs.
1959    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1960    if ((level + 1 < __kmp_nested_nth.used) &&
1961        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1962      nthreads_icv = __kmp_nested_nth.nth[level + 1];
1963    } else {
1964      nthreads_icv = 0; // don't update
1965    }
1966
1967    // Figure out the proc_bind_policy for the new team.
1968    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1969    kmp_proc_bind_t proc_bind_icv =
1970        proc_bind_default; // proc_bind_default means don't update
1971    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1972      proc_bind = proc_bind_false;
1973    } else {
1974      if (proc_bind == proc_bind_default) {
1975        // No proc_bind clause specified; use current proc-bind-var for this
1976        // parallel region
1977        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1978      }
1979      /* else: The proc_bind policy was specified explicitly on parallel clause.
1980         This overrides proc-bind-var for this parallel region, but does not
1981         change proc-bind-var. */
1982      // Figure the value of proc-bind-var for the child threads.
1983      if ((level + 1 < __kmp_nested_proc_bind.used) &&
1984          (__kmp_nested_proc_bind.bind_types[level + 1] !=
1985           master_th->th.th_current_task->td_icvs.proc_bind)) {
1986        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1987      }
1988    }
1989
1990    // Reset for next parallel region
1991    master_th->th.th_set_proc_bind = proc_bind_default;
1992
1993    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1994      kmp_internal_control_t new_icvs;
1995      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1996      new_icvs.next = NULL;
1997      if (nthreads_icv > 0) {
1998        new_icvs.nproc = nthreads_icv;
1999      }
2000      if (proc_bind_icv != proc_bind_default) {
2001        new_icvs.proc_bind = proc_bind_icv;
2002      }
2003
2004      /* allocate a new parallel team */
2005      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2006      team = __kmp_allocate_team(root, nthreads, nthreads,
2007#if OMPT_SUPPORT
2008                                 ompt_parallel_data,
2009#endif
2010                                 proc_bind, &new_icvs,
2011                                 argc USE_NESTED_HOT_ARG(master_th));
2012    } else {
2013      /* allocate a new parallel team */
2014      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2015      team = __kmp_allocate_team(root, nthreads, nthreads,
2016#if OMPT_SUPPORT
2017                                 ompt_parallel_data,
2018#endif
2019                                 proc_bind,
2020                                 &master_th->th.th_current_task->td_icvs,
2021                                 argc USE_NESTED_HOT_ARG(master_th));
2022    }
2023    KF_TRACE(
2024        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2025
2026    /* setup the new team */
2027    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2028    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2029    KMP_CHECK_UPDATE(team->t.t_ident, loc);
2030    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2031    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2032#if OMPT_SUPPORT
2033    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2034                          return_address);
2035#endif
2036    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2037    // TODO: parent_team->t.t_level == INT_MAX ???
2038    if (!master_th->th.th_teams_microtask || level > teams_level) {
2039      int new_level = parent_team->t.t_level + 1;
2040      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2041      new_level = parent_team->t.t_active_level + 1;
2042      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2043    } else {
2044      // AC: Do not increase parallel level at start of the teams construct
2045      int new_level = parent_team->t.t_level;
2046      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2047      new_level = parent_team->t.t_active_level;
2048      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2049    }
2050    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2051    // set master's schedule as new run-time schedule
2052    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2053
2054    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2055    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2056
2057    // Update the floating point rounding in the team if required.
2058    propagateFPControl(team);
2059
2060    if (__kmp_tasking_mode != tskm_immediate_exec) {
2061      // Set master's task team to team's task team. Unless this is hot team, it
2062      // should be NULL.
2063      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2064                       parent_team->t.t_task_team[master_th->th.th_task_state]);
2065      KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2066                    "%p, new task_team %p / team %p\n",
2067                    __kmp_gtid_from_thread(master_th),
2068                    master_th->th.th_task_team, parent_team,
2069                    team->t.t_task_team[master_th->th.th_task_state], team));
2070
2071      if (active_level || master_th->th.th_task_team) {
2072        // Take a memo of master's task_state
2073        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2074        if (master_th->th.th_task_state_top >=
2075            master_th->th.th_task_state_stack_sz) { // increase size
2076          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2077          kmp_uint8 *old_stack, *new_stack;
2078          kmp_uint32 i;
2079          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2080          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2081            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2082          }
2083          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2084               ++i) { // zero-init rest of stack
2085            new_stack[i] = 0;
2086          }
2087          old_stack = master_th->th.th_task_state_memo_stack;
2088          master_th->th.th_task_state_memo_stack = new_stack;
2089          master_th->th.th_task_state_stack_sz = new_size;
2090          __kmp_free(old_stack);
2091        }
2092        // Store master's task_state on stack
2093        master_th->th
2094            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2095            master_th->th.th_task_state;
2096        master_th->th.th_task_state_top++;
2097#if KMP_NESTED_HOT_TEAMS
2098        if (master_th->th.th_hot_teams &&
2099            active_level < __kmp_hot_teams_max_level &&
2100            team == master_th->th.th_hot_teams[active_level].hot_team) {
2101          // Restore master's nested state if nested hot team
2102          master_th->th.th_task_state =
2103              master_th->th
2104                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2105        } else {
2106#endif
2107          master_th->th.th_task_state = 0;
2108#if KMP_NESTED_HOT_TEAMS
2109        }
2110#endif
2111      }
2112#if !KMP_NESTED_HOT_TEAMS
2113      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2114                       (team == root->r.r_hot_team));
2115#endif
2116    }
2117
2118    KA_TRACE(
2119        20,
2120        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2121         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2122         team->t.t_nproc));
2123    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2124                     (team->t.t_master_tid == 0 &&
2125                      (team->t.t_parent == root->r.r_root_team ||
2126                       team->t.t_parent->t.t_serialized)));
2127    KMP_MB();
2128
2129    /* now, setup the arguments */
2130    argv = (void **)team->t.t_argv;
2131    if (ap) {
2132      for (i = argc - 1; i >= 0; --i) {
2133// TODO: revert workaround for Intel(R) 64 tracker #96
2134#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2135        void *new_argv = va_arg(*ap, void *);
2136#else
2137        void *new_argv = va_arg(ap, void *);
2138#endif
2139        KMP_CHECK_UPDATE(*argv, new_argv);
2140        argv++;
2141      }
2142    } else {
2143      for (i = 0; i < argc; ++i) {
2144        // Get args from parent team for teams construct
2145        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2146      }
2147    }
2148
2149    /* now actually fork the threads */
2150    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2151    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2152      root->r.r_active = TRUE;
2153
2154    __kmp_fork_team_threads(root, team, master_th, gtid);
2155    __kmp_setup_icv_copy(team, nthreads,
2156                         &master_th->th.th_current_task->td_icvs, loc);
2157
2158#if OMPT_SUPPORT
2159    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2160#endif
2161
2162    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2163
2164#if USE_ITT_BUILD
2165    if (team->t.t_active_level == 1 // only report frames at level 1
2166        && !master_th->th.th_teams_microtask) { // not in teams construct
2167#if USE_ITT_NOTIFY
2168      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2169          (__kmp_forkjoin_frames_mode == 3 ||
2170           __kmp_forkjoin_frames_mode == 1)) {
2171        kmp_uint64 tmp_time = 0;
2172        if (__itt_get_timestamp_ptr)
2173          tmp_time = __itt_get_timestamp();
2174        // Internal fork - report frame begin
2175        master_th->th.th_frame_time = tmp_time;
2176        if (__kmp_forkjoin_frames_mode == 3)
2177          team->t.t_region_time = tmp_time;
2178      } else
2179// only one notification scheme (either "submit" or "forking/joined", not both)
2180#endif /* USE_ITT_NOTIFY */
2181          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2182              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2183        // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2184        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2185      }
2186    }
2187#endif /* USE_ITT_BUILD */
2188
2189    /* now go on and do the work */
2190    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2191    KMP_MB();
2192    KF_TRACE(10,
2193             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2194              root, team, master_th, gtid));
2195
2196#if USE_ITT_BUILD
2197    if (__itt_stack_caller_create_ptr) {
2198      team->t.t_stack_id =
2199          __kmp_itt_stack_caller_create(); // create new stack stitching id
2200      // before entering fork barrier
2201    }
2202#endif /* USE_ITT_BUILD */
2203
2204    // AC: skip __kmp_internal_fork at teams construct, let only master
2205    // threads execute
2206    if (ap) {
2207      __kmp_internal_fork(loc, gtid, team);
2208      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2209                    "master_th=%p, gtid=%d\n",
2210                    root, team, master_th, gtid));
2211    }
2212
2213    if (call_context == fork_context_gnu) {
2214      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2215      return TRUE;
2216    }
2217
2218    /* Invoke microtask for MASTER thread */
2219    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2220                  team->t.t_id, team->t.t_pkfn));
2221  } // END of timer KMP_fork_call block
2222
2223#if KMP_STATS_ENABLED
2224  // If beginning a teams construct, then change thread state
2225  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2226  if (!ap) {
2227    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2228  }
2229#endif
2230
2231  if (!team->t.t_invoke(gtid)) {
2232    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2233  }
2234
2235#if KMP_STATS_ENABLED
2236  // If was beginning of a teams construct, then reset thread state
2237  if (!ap) {
2238    KMP_SET_THREAD_STATE(previous_state);
2239  }
2240#endif
2241
2242  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2243                team->t.t_id, team->t.t_pkfn));
2244  KMP_MB(); /* Flush all pending memory write invalidates.  */
2245
2246  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2247
2248#if OMPT_SUPPORT
2249  if (ompt_enabled.enabled) {
2250    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2251  }
2252#endif
2253
2254  return TRUE;
2255}
2256
2257#if OMPT_SUPPORT
2258static inline void __kmp_join_restore_state(kmp_info_t *thread,
2259                                            kmp_team_t *team) {
2260  // restore state outside the region
2261  thread->th.ompt_thread_info.state =
2262      ((team->t.t_serialized) ? ompt_state_work_serial
2263                              : ompt_state_work_parallel);
2264}
2265
2266static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2267                                   kmp_team_t *team, ompt_data_t *parallel_data,
2268                                   int flags, void *codeptr) {
2269  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2270  if (ompt_enabled.ompt_callback_parallel_end) {
2271    ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2272        parallel_data, &(task_info->task_data), flags, codeptr);
2273  }
2274
2275  task_info->frame.enter_frame = ompt_data_none;
2276  __kmp_join_restore_state(thread, team);
2277}
2278#endif
2279
2280void __kmp_join_call(ident_t *loc, int gtid
2281#if OMPT_SUPPORT
2282                     ,
2283                     enum fork_context_e fork_context
2284#endif
2285                     ,
2286                     int exit_teams) {
2287  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2288  kmp_team_t *team;
2289  kmp_team_t *parent_team;
2290  kmp_info_t *master_th;
2291  kmp_root_t *root;
2292  int master_active;
2293
2294  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2295
2296  /* setup current data */
2297  master_th = __kmp_threads[gtid];
2298  root = master_th->th.th_root;
2299  team = master_th->th.th_team;
2300  parent_team = team->t.t_parent;
2301
2302  master_th->th.th_ident = loc;
2303
2304#if OMPT_SUPPORT
2305  void *team_microtask = (void *)team->t.t_pkfn;
2306  if (ompt_enabled.enabled) {
2307    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2308  }
2309#endif
2310
2311#if KMP_DEBUG
2312  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2313    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2314                  "th_task_team = %p\n",
2315                  __kmp_gtid_from_thread(master_th), team,
2316                  team->t.t_task_team[master_th->th.th_task_state],
2317                  master_th->th.th_task_team));
2318    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2319                     team->t.t_task_team[master_th->th.th_task_state]);
2320  }
2321#endif
2322
2323  if (team->t.t_serialized) {
2324    if (master_th->th.th_teams_microtask) {
2325      // We are in teams construct
2326      int level = team->t.t_level;
2327      int tlevel = master_th->th.th_teams_level;
2328      if (level == tlevel) {
2329        // AC: we haven't incremented it earlier at start of teams construct,
2330        //     so do it here - at the end of teams construct
2331        team->t.t_level++;
2332      } else if (level == tlevel + 1) {
2333        // AC: we are exiting parallel inside teams, need to increment
2334        // serialization in order to restore it in the next call to
2335        // __kmpc_end_serialized_parallel
2336        team->t.t_serialized++;
2337      }
2338    }
2339    __kmpc_end_serialized_parallel(loc, gtid);
2340
2341#if OMPT_SUPPORT
2342    if (ompt_enabled.enabled) {
2343      __kmp_join_restore_state(master_th, parent_team);
2344    }
2345#endif
2346
2347    return;
2348  }
2349
2350  master_active = team->t.t_master_active;
2351
2352  if (!exit_teams) {
2353    // AC: No barrier for internal teams at exit from teams construct.
2354    //     But there is barrier for external team (league).
2355    __kmp_internal_join(loc, gtid, team);
2356  } else {
2357    master_th->th.th_task_state =
2358        0; // AC: no tasking in teams (out of any parallel)
2359  }
2360
2361  KMP_MB();
2362
2363#if OMPT_SUPPORT
2364  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2365  void *codeptr = team->t.ompt_team_info.master_return_address;
2366#endif
2367
2368#if USE_ITT_BUILD
2369  if (__itt_stack_caller_create_ptr) {
2370    __kmp_itt_stack_caller_destroy(
2371        (__itt_caller)team->t
2372            .t_stack_id); // destroy the stack stitching id after join barrier
2373  }
2374
2375  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2376  if (team->t.t_active_level == 1 &&
2377      !master_th->th.th_teams_microtask) { /* not in teams construct */
2378    master_th->th.th_ident = loc;
2379    // only one notification scheme (either "submit" or "forking/joined", not
2380    // both)
2381    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2382        __kmp_forkjoin_frames_mode == 3)
2383      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2384                             master_th->th.th_frame_time, 0, loc,
2385                             master_th->th.th_team_nproc, 1);
2386    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2387             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2388      __kmp_itt_region_joined(gtid);
2389  } // active_level == 1
2390#endif /* USE_ITT_BUILD */
2391
2392  if (master_th->th.th_teams_microtask && !exit_teams &&
2393      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2394      team->t.t_level == master_th->th.th_teams_level + 1) {
2395// AC: We need to leave the team structure intact at the end of parallel
2396// inside the teams construct, so that at the next parallel same (hot) team
2397// works, only adjust nesting levels
2398#if OMPT_SUPPORT
2399    ompt_data_t ompt_parallel_data = ompt_data_none;
2400    if (ompt_enabled.enabled) {
2401      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2402      if (ompt_enabled.ompt_callback_implicit_task) {
2403        int ompt_team_size = team->t.t_nproc;
2404        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2405            ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2406            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2407      }
2408      task_info->frame.exit_frame = ompt_data_none;
2409      task_info->task_data = ompt_data_none;
2410      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2411      __ompt_lw_taskteam_unlink(master_th);
2412    }
2413#endif
2414    /* Decrement our nested depth level */
2415    team->t.t_level--;
2416    team->t.t_active_level--;
2417    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2418
2419    // Restore number of threads in the team if needed. This code relies on
2420    // the proper adjustment of th_teams_size.nth after the fork in
2421    // __kmp_teams_master on each teams master in the case that
2422    // __kmp_reserve_threads reduced it.
2423    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2424      int old_num = master_th->th.th_team_nproc;
2425      int new_num = master_th->th.th_teams_size.nth;
2426      kmp_info_t **other_threads = team->t.t_threads;
2427      team->t.t_nproc = new_num;
2428      for (int i = 0; i < old_num; ++i) {
2429        other_threads[i]->th.th_team_nproc = new_num;
2430      }
2431      // Adjust states of non-used threads of the team
2432      for (int i = old_num; i < new_num; ++i) {
2433        // Re-initialize thread's barrier data.
2434        KMP_DEBUG_ASSERT(other_threads[i]);
2435        kmp_balign_t *balign = other_threads[i]->th.th_bar;
2436        for (int b = 0; b < bs_last_barrier; ++b) {
2437          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2438          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2439#if USE_DEBUGGER
2440          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2441#endif
2442        }
2443        if (__kmp_tasking_mode != tskm_immediate_exec) {
2444          // Synchronize thread's task state
2445          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2446        }
2447      }
2448    }
2449
2450#if OMPT_SUPPORT
2451    if (ompt_enabled.enabled) {
2452      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2453                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2454    }
2455#endif
2456
2457    return;
2458  }
2459
2460  /* do cleanup and restore the parent team */
2461  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2462  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2463
2464  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2465
2466  /* jc: The following lock has instructions with REL and ACQ semantics,
2467     separating the parallel user code called in this parallel region
2468     from the serial user code called after this function returns. */
2469  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2470
2471  if (!master_th->th.th_teams_microtask ||
2472      team->t.t_level > master_th->th.th_teams_level) {
2473    /* Decrement our nested depth level */
2474    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2475  }
2476  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2477
2478#if OMPT_SUPPORT
2479  if (ompt_enabled.enabled) {
2480    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2481    if (ompt_enabled.ompt_callback_implicit_task) {
2482      int flags = (team_microtask == (void *)__kmp_teams_master)
2483                      ? ompt_task_initial
2484                      : ompt_task_implicit;
2485      int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2486      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2487          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2488          OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2489    }
2490    task_info->frame.exit_frame = ompt_data_none;
2491    task_info->task_data = ompt_data_none;
2492  }
2493#endif
2494
2495  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2496                master_th, team));
2497  __kmp_pop_current_task_from_thread(master_th);
2498
2499#if KMP_AFFINITY_SUPPORTED
2500  // Restore master thread's partition.
2501  master_th->th.th_first_place = team->t.t_first_place;
2502  master_th->th.th_last_place = team->t.t_last_place;
2503#endif // KMP_AFFINITY_SUPPORTED
2504  master_th->th.th_def_allocator = team->t.t_def_allocator;
2505
2506  updateHWFPControl(team);
2507
2508  if (root->r.r_active != master_active)
2509    root->r.r_active = master_active;
2510
2511  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2512                            master_th)); // this will free worker threads
2513
2514  /* this race was fun to find. make sure the following is in the critical
2515     region otherwise assertions may fail occasionally since the old team may be
2516     reallocated and the hierarchy appears inconsistent. it is actually safe to
2517     run and won't cause any bugs, but will cause those assertion failures. it's
2518     only one deref&assign so might as well put this in the critical region */
2519  master_th->th.th_team = parent_team;
2520  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2521  master_th->th.th_team_master = parent_team->t.t_threads[0];
2522  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2523
2524  /* restore serialized team, if need be */
2525  if (parent_team->t.t_serialized &&
2526      parent_team != master_th->th.th_serial_team &&
2527      parent_team != root->r.r_root_team) {
2528    __kmp_free_team(root,
2529                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2530    master_th->th.th_serial_team = parent_team;
2531  }
2532
2533  if (__kmp_tasking_mode != tskm_immediate_exec) {
2534    if (master_th->th.th_task_state_top >
2535        0) { // Restore task state from memo stack
2536      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2537      // Remember master's state if we re-use this nested hot team
2538      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2539          master_th->th.th_task_state;
2540      --master_th->th.th_task_state_top; // pop
2541      // Now restore state at this level
2542      master_th->th.th_task_state =
2543          master_th->th
2544              .th_task_state_memo_stack[master_th->th.th_task_state_top];
2545    }
2546    // Copy the task team from the parent team to the master thread
2547    master_th->th.th_task_team =
2548        parent_team->t.t_task_team[master_th->th.th_task_state];
2549    KA_TRACE(20,
2550             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2551              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2552              parent_team));
2553  }
2554
2555  // TODO: GEH - cannot do this assertion because root thread not set up as
2556  // executing
2557  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2558  master_th->th.th_current_task->td_flags.executing = 1;
2559
2560  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2561
2562#if OMPT_SUPPORT
2563  int flags =
2564      OMPT_INVOKER(fork_context) |
2565      ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2566                                                      : ompt_parallel_team);
2567  if (ompt_enabled.enabled) {
2568    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2569                    codeptr);
2570  }
2571#endif
2572
2573  KMP_MB();
2574  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2575}
2576
2577/* Check whether we should push an internal control record onto the
2578   serial team stack.  If so, do it.  */
2579void __kmp_save_internal_controls(kmp_info_t *thread) {
2580
2581  if (thread->th.th_team != thread->th.th_serial_team) {
2582    return;
2583  }
2584  if (thread->th.th_team->t.t_serialized > 1) {
2585    int push = 0;
2586
2587    if (thread->th.th_team->t.t_control_stack_top == NULL) {
2588      push = 1;
2589    } else {
2590      if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2591          thread->th.th_team->t.t_serialized) {
2592        push = 1;
2593      }
2594    }
2595    if (push) { /* push a record on the serial team's stack */
2596      kmp_internal_control_t *control =
2597          (kmp_internal_control_t *)__kmp_allocate(
2598              sizeof(kmp_internal_control_t));
2599
2600      copy_icvs(control, &thread->th.th_current_task->td_icvs);
2601
2602      control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2603
2604      control->next = thread->th.th_team->t.t_control_stack_top;
2605      thread->th.th_team->t.t_control_stack_top = control;
2606    }
2607  }
2608}
2609
2610/* Changes set_nproc */
2611void __kmp_set_num_threads(int new_nth, int gtid) {
2612  kmp_info_t *thread;
2613  kmp_root_t *root;
2614
2615  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2616  KMP_DEBUG_ASSERT(__kmp_init_serial);
2617
2618  if (new_nth < 1)
2619    new_nth = 1;
2620  else if (new_nth > __kmp_max_nth)
2621    new_nth = __kmp_max_nth;
2622
2623  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2624  thread = __kmp_threads[gtid];
2625  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2626    return; // nothing to do
2627
2628  __kmp_save_internal_controls(thread);
2629
2630  set__nproc(thread, new_nth);
2631
2632  // If this omp_set_num_threads() call will cause the hot team size to be
2633  // reduced (in the absence of a num_threads clause), then reduce it now,
2634  // rather than waiting for the next parallel region.
2635  root = thread->th.th_root;
2636  if (__kmp_init_parallel && (!root->r.r_active) &&
2637      (root->r.r_hot_team->t.t_nproc > new_nth)
2638#if KMP_NESTED_HOT_TEAMS
2639      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2640#endif
2641      ) {
2642    kmp_team_t *hot_team = root->r.r_hot_team;
2643    int f;
2644
2645    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2646
2647    // Release the extra threads we don't need any more.
2648    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2649      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2650      if (__kmp_tasking_mode != tskm_immediate_exec) {
2651        // When decreasing team size, threads no longer in the team should unref
2652        // task team.
2653        hot_team->t.t_threads[f]->th.th_task_team = NULL;
2654      }
2655      __kmp_free_thread(hot_team->t.t_threads[f]);
2656      hot_team->t.t_threads[f] = NULL;
2657    }
2658    hot_team->t.t_nproc = new_nth;
2659#if KMP_NESTED_HOT_TEAMS
2660    if (thread->th.th_hot_teams) {
2661      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2662      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2663    }
2664#endif
2665
2666    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2667
2668    // Update the t_nproc field in the threads that are still active.
2669    for (f = 0; f < new_nth; f++) {
2670      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2671      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2672    }
2673    // Special flag in case omp_set_num_threads() call
2674    hot_team->t.t_size_changed = -1;
2675  }
2676}
2677
2678/* Changes max_active_levels */
2679void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2680  kmp_info_t *thread;
2681
2682  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2683                "%d = (%d)\n",
2684                gtid, max_active_levels));
2685  KMP_DEBUG_ASSERT(__kmp_init_serial);
2686
2687  // validate max_active_levels
2688  if (max_active_levels < 0) {
2689    KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2690    // We ignore this call if the user has specified a negative value.
2691    // The current setting won't be changed. The last valid setting will be
2692    // used. A warning will be issued (if warnings are allowed as controlled by
2693    // the KMP_WARNINGS env var).
2694    KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2695                  "max_active_levels for thread %d = (%d)\n",
2696                  gtid, max_active_levels));
2697    return;
2698  }
2699  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2700    // it's OK, the max_active_levels is within the valid range: [ 0;
2701    // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2702    // We allow a zero value. (implementation defined behavior)
2703  } else {
2704    KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2705                KMP_MAX_ACTIVE_LEVELS_LIMIT);
2706    max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2707    // Current upper limit is MAX_INT. (implementation defined behavior)
2708    // If the input exceeds the upper limit, we correct the input to be the
2709    // upper limit. (implementation defined behavior)
2710    // Actually, the flow should never get here until we use MAX_INT limit.
2711  }
2712  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2713                "max_active_levels for thread %d = (%d)\n",
2714                gtid, max_active_levels));
2715
2716  thread = __kmp_threads[gtid];
2717
2718  __kmp_save_internal_controls(thread);
2719
2720  set__max_active_levels(thread, max_active_levels);
2721}
2722
2723/* Gets max_active_levels */
2724int __kmp_get_max_active_levels(int gtid) {
2725  kmp_info_t *thread;
2726
2727  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2728  KMP_DEBUG_ASSERT(__kmp_init_serial);
2729
2730  thread = __kmp_threads[gtid];
2731  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2732  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2733                "curtask_maxaclevel=%d\n",
2734                gtid, thread->th.th_current_task,
2735                thread->th.th_current_task->td_icvs.max_active_levels));
2736  return thread->th.th_current_task->td_icvs.max_active_levels;
2737}
2738
2739KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2740KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2741
2742/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2743void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2744  kmp_info_t *thread;
2745  kmp_sched_t orig_kind;
2746  //    kmp_team_t *team;
2747
2748  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2749                gtid, (int)kind, chunk));
2750  KMP_DEBUG_ASSERT(__kmp_init_serial);
2751
2752  // Check if the kind parameter is valid, correct if needed.
2753  // Valid parameters should fit in one of two intervals - standard or extended:
2754  //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2755  // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2756  orig_kind = kind;
2757  kind = __kmp_sched_without_mods(kind);
2758
2759  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2760      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2761    // TODO: Hint needs attention in case we change the default schedule.
2762    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2763              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2764              __kmp_msg_null);
2765    kind = kmp_sched_default;
2766    chunk = 0; // ignore chunk value in case of bad kind
2767  }
2768
2769  thread = __kmp_threads[gtid];
2770
2771  __kmp_save_internal_controls(thread);
2772
2773  if (kind < kmp_sched_upper_std) {
2774    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2775      // differ static chunked vs. unchunked:  chunk should be invalid to
2776      // indicate unchunked schedule (which is the default)
2777      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2778    } else {
2779      thread->th.th_current_task->td_icvs.sched.r_sched_type =
2780          __kmp_sch_map[kind - kmp_sched_lower - 1];
2781    }
2782  } else {
2783    //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2784    //    kmp_sched_lower - 2 ];
2785    thread->th.th_current_task->td_icvs.sched.r_sched_type =
2786        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2787                      kmp_sched_lower - 2];
2788  }
2789  __kmp_sched_apply_mods_intkind(
2790      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2791  if (kind == kmp_sched_auto || chunk < 1) {
2792    // ignore parameter chunk for schedule auto
2793    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2794  } else {
2795    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2796  }
2797}
2798
2799/* Gets def_sched_var ICV values */
2800void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2801  kmp_info_t *thread;
2802  enum sched_type th_type;
2803
2804  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2805  KMP_DEBUG_ASSERT(__kmp_init_serial);
2806
2807  thread = __kmp_threads[gtid];
2808
2809  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2810  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2811  case kmp_sch_static:
2812  case kmp_sch_static_greedy:
2813  case kmp_sch_static_balanced:
2814    *kind = kmp_sched_static;
2815    __kmp_sched_apply_mods_stdkind(kind, th_type);
2816    *chunk = 0; // chunk was not set, try to show this fact via zero value
2817    return;
2818  case kmp_sch_static_chunked:
2819    *kind = kmp_sched_static;
2820    break;
2821  case kmp_sch_dynamic_chunked:
2822    *kind = kmp_sched_dynamic;
2823    break;
2824  case kmp_sch_guided_chunked:
2825  case kmp_sch_guided_iterative_chunked:
2826  case kmp_sch_guided_analytical_chunked:
2827    *kind = kmp_sched_guided;
2828    break;
2829  case kmp_sch_auto:
2830    *kind = kmp_sched_auto;
2831    break;
2832  case kmp_sch_trapezoidal:
2833    *kind = kmp_sched_trapezoidal;
2834    break;
2835#if KMP_STATIC_STEAL_ENABLED
2836  case kmp_sch_static_steal:
2837    *kind = kmp_sched_static_steal;
2838    break;
2839#endif
2840  default:
2841    KMP_FATAL(UnknownSchedulingType, th_type);
2842  }
2843
2844  __kmp_sched_apply_mods_stdkind(kind, th_type);
2845  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2846}
2847
2848int __kmp_get_ancestor_thread_num(int gtid, int level) {
2849
2850  int ii, dd;
2851  kmp_team_t *team;
2852  kmp_info_t *thr;
2853
2854  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2855  KMP_DEBUG_ASSERT(__kmp_init_serial);
2856
2857  // validate level
2858  if (level == 0)
2859    return 0;
2860  if (level < 0)
2861    return -1;
2862  thr = __kmp_threads[gtid];
2863  team = thr->th.th_team;
2864  ii = team->t.t_level;
2865  if (level > ii)
2866    return -1;
2867
2868  if (thr->th.th_teams_microtask) {
2869    // AC: we are in teams region where multiple nested teams have same level
2870    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2871    if (level <=
2872        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2873      KMP_DEBUG_ASSERT(ii >= tlevel);
2874      // AC: As we need to pass by the teams league, we need to artificially
2875      // increase ii
2876      if (ii == tlevel) {
2877        ii += 2; // three teams have same level
2878      } else {
2879        ii++; // two teams have same level
2880      }
2881    }
2882  }
2883
2884  if (ii == level)
2885    return __kmp_tid_from_gtid(gtid);
2886
2887  dd = team->t.t_serialized;
2888  level++;
2889  while (ii > level) {
2890    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2891    }
2892    if ((team->t.t_serialized) && (!dd)) {
2893      team = team->t.t_parent;
2894      continue;
2895    }
2896    if (ii > level) {
2897      team = team->t.t_parent;
2898      dd = team->t.t_serialized;
2899      ii--;
2900    }
2901  }
2902
2903  return (dd > 1) ? (0) : (team->t.t_master_tid);
2904}
2905
2906int __kmp_get_team_size(int gtid, int level) {
2907
2908  int ii, dd;
2909  kmp_team_t *team;
2910  kmp_info_t *thr;
2911
2912  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2913  KMP_DEBUG_ASSERT(__kmp_init_serial);
2914
2915  // validate level
2916  if (level == 0)
2917    return 1;
2918  if (level < 0)
2919    return -1;
2920  thr = __kmp_threads[gtid];
2921  team = thr->th.th_team;
2922  ii = team->t.t_level;
2923  if (level > ii)
2924    return -1;
2925
2926  if (thr->th.th_teams_microtask) {
2927    // AC: we are in teams region where multiple nested teams have same level
2928    int tlevel = thr->th.th_teams_level; // the level of the teams construct
2929    if (level <=
2930        tlevel) { // otherwise usual algorithm works (will not touch the teams)
2931      KMP_DEBUG_ASSERT(ii >= tlevel);
2932      // AC: As we need to pass by the teams league, we need to artificially
2933      // increase ii
2934      if (ii == tlevel) {
2935        ii += 2; // three teams have same level
2936      } else {
2937        ii++; // two teams have same level
2938      }
2939    }
2940  }
2941
2942  while (ii > level) {
2943    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2944    }
2945    if (team->t.t_serialized && (!dd)) {
2946      team = team->t.t_parent;
2947      continue;
2948    }
2949    if (ii > level) {
2950      team = team->t.t_parent;
2951      ii--;
2952    }
2953  }
2954
2955  return team->t.t_nproc;
2956}
2957
2958kmp_r_sched_t __kmp_get_schedule_global() {
2959  // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2960  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2961  // independently. So one can get the updated schedule here.
2962
2963  kmp_r_sched_t r_sched;
2964
2965  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2966  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2967  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2968  // different roots (even in OMP 2.5)
2969  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2970  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2971  if (s == kmp_sch_static) {
2972    // replace STATIC with more detailed schedule (balanced or greedy)
2973    r_sched.r_sched_type = __kmp_static;
2974  } else if (s == kmp_sch_guided_chunked) {
2975    // replace GUIDED with more detailed schedule (iterative or analytical)
2976    r_sched.r_sched_type = __kmp_guided;
2977  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2978    r_sched.r_sched_type = __kmp_sched;
2979  }
2980  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2981
2982  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2983    // __kmp_chunk may be wrong here (if it was not ever set)
2984    r_sched.chunk = KMP_DEFAULT_CHUNK;
2985  } else {
2986    r_sched.chunk = __kmp_chunk;
2987  }
2988
2989  return r_sched;
2990}
2991
2992/* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2993   at least argc number of *t_argv entries for the requested team. */
2994static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2995
2996  KMP_DEBUG_ASSERT(team);
2997  if (!realloc || argc > team->t.t_max_argc) {
2998
2999    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3000                   "current entries=%d\n",
3001                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3002    /* if previously allocated heap space for args, free them */
3003    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3004      __kmp_free((void *)team->t.t_argv);
3005
3006    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3007      /* use unused space in the cache line for arguments */
3008      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3009      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3010                     "argv entries\n",
3011                     team->t.t_id, team->t.t_max_argc));
3012      team->t.t_argv = &team->t.t_inline_argv[0];
3013      if (__kmp_storage_map) {
3014        __kmp_print_storage_map_gtid(
3015            -1, &team->t.t_inline_argv[0],
3016            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3017            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3018            team->t.t_id);
3019      }
3020    } else {
3021      /* allocate space for arguments in the heap */
3022      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3023                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
3024                               : 2 * argc;
3025      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3026                     "argv entries\n",
3027                     team->t.t_id, team->t.t_max_argc));
3028      team->t.t_argv =
3029          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3030      if (__kmp_storage_map) {
3031        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3032                                     &team->t.t_argv[team->t.t_max_argc],
3033                                     sizeof(void *) * team->t.t_max_argc,
3034                                     "team_%d.t_argv", team->t.t_id);
3035      }
3036    }
3037  }
3038}
3039
3040static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3041  int i;
3042  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3043  team->t.t_threads =
3044      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3045  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3046      sizeof(dispatch_shared_info_t) * num_disp_buff);
3047  team->t.t_dispatch =
3048      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3049  team->t.t_implicit_task_taskdata =
3050      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3051  team->t.t_max_nproc = max_nth;
3052
3053  /* setup dispatch buffers */
3054  for (i = 0; i < num_disp_buff; ++i) {
3055    team->t.t_disp_buffer[i].buffer_index = i;
3056    team->t.t_disp_buffer[i].doacross_buf_idx = i;
3057  }
3058}
3059
3060static void __kmp_free_team_arrays(kmp_team_t *team) {
3061  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3062  int i;
3063  for (i = 0; i < team->t.t_max_nproc; ++i) {
3064    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3065      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3066      team->t.t_dispatch[i].th_disp_buffer = NULL;
3067    }
3068  }
3069#if KMP_USE_HIER_SCHED
3070  __kmp_dispatch_free_hierarchies(team);
3071#endif
3072  __kmp_free(team->t.t_threads);
3073  __kmp_free(team->t.t_disp_buffer);
3074  __kmp_free(team->t.t_dispatch);
3075  __kmp_free(team->t.t_implicit_task_taskdata);
3076  team->t.t_threads = NULL;
3077  team->t.t_disp_buffer = NULL;
3078  team->t.t_dispatch = NULL;
3079  team->t.t_implicit_task_taskdata = 0;
3080}
3081
3082static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3083  kmp_info_t **oldThreads = team->t.t_threads;
3084
3085  __kmp_free(team->t.t_disp_buffer);
3086  __kmp_free(team->t.t_dispatch);
3087  __kmp_free(team->t.t_implicit_task_taskdata);
3088  __kmp_allocate_team_arrays(team, max_nth);
3089
3090  KMP_MEMCPY(team->t.t_threads, oldThreads,
3091             team->t.t_nproc * sizeof(kmp_info_t *));
3092
3093  __kmp_free(oldThreads);
3094}
3095
3096static kmp_internal_control_t __kmp_get_global_icvs(void) {
3097
3098  kmp_r_sched_t r_sched =
3099      __kmp_get_schedule_global(); // get current state of scheduling globals
3100
3101  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3102
3103  kmp_internal_control_t g_icvs = {
3104    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3105    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3106    // adjustment of threads (per thread)
3107    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3108    // whether blocktime is explicitly set
3109    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3110#if KMP_USE_MONITOR
3111    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3112// intervals
3113#endif
3114    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3115    // next parallel region (per thread)
3116    // (use a max ub on value if __kmp_parallel_initialize not called yet)
3117    __kmp_cg_max_nth, // int thread_limit;
3118    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3119    // for max_active_levels
3120    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3121    // {sched,chunk} pair
3122    __kmp_nested_proc_bind.bind_types[0],
3123    __kmp_default_device,
3124    NULL // struct kmp_internal_control *next;
3125  };
3126
3127  return g_icvs;
3128}
3129
3130static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3131
3132  kmp_internal_control_t gx_icvs;
3133  gx_icvs.serial_nesting_level =
3134      0; // probably =team->t.t_serial like in save_inter_controls
3135  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3136  gx_icvs.next = NULL;
3137
3138  return gx_icvs;
3139}
3140
3141static void __kmp_initialize_root(kmp_root_t *root) {
3142  int f;
3143  kmp_team_t *root_team;
3144  kmp_team_t *hot_team;
3145  int hot_team_max_nth;
3146  kmp_r_sched_t r_sched =
3147      __kmp_get_schedule_global(); // get current state of scheduling globals
3148  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3149  KMP_DEBUG_ASSERT(root);
3150  KMP_ASSERT(!root->r.r_begin);
3151
3152  /* setup the root state structure */
3153  __kmp_init_lock(&root->r.r_begin_lock);
3154  root->r.r_begin = FALSE;
3155  root->r.r_active = FALSE;
3156  root->r.r_in_parallel = 0;
3157  root->r.r_blocktime = __kmp_dflt_blocktime;
3158
3159  /* setup the root team for this task */
3160  /* allocate the root team structure */
3161  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3162
3163  root_team =
3164      __kmp_allocate_team(root,
3165                          1, // new_nproc
3166                          1, // max_nproc
3167#if OMPT_SUPPORT
3168                          ompt_data_none, // root parallel id
3169#endif
3170                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3171                          0 // argc
3172                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3173                          );
3174#if USE_DEBUGGER
3175  // Non-NULL value should be assigned to make the debugger display the root
3176  // team.
3177  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3178#endif
3179
3180  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3181
3182  root->r.r_root_team = root_team;
3183  root_team->t.t_control_stack_top = NULL;
3184
3185  /* initialize root team */
3186  root_team->t.t_threads[0] = NULL;
3187  root_team->t.t_nproc = 1;
3188  root_team->t.t_serialized = 1;
3189  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3190  root_team->t.t_sched.sched = r_sched.sched;
3191  KA_TRACE(
3192      20,
3193      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3194       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3195
3196  /* setup the  hot team for this task */
3197  /* allocate the hot team structure */
3198  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3199
3200  hot_team =
3201      __kmp_allocate_team(root,
3202                          1, // new_nproc
3203                          __kmp_dflt_team_nth_ub * 2, // max_nproc
3204#if OMPT_SUPPORT
3205                          ompt_data_none, // root parallel id
3206#endif
3207                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3208                          0 // argc
3209                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3210                          );
3211  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3212
3213  root->r.r_hot_team = hot_team;
3214  root_team->t.t_control_stack_top = NULL;
3215
3216  /* first-time initialization */
3217  hot_team->t.t_parent = root_team;
3218
3219  /* initialize hot team */
3220  hot_team_max_nth = hot_team->t.t_max_nproc;
3221  for (f = 0; f < hot_team_max_nth; ++f) {
3222    hot_team->t.t_threads[f] = NULL;
3223  }
3224  hot_team->t.t_nproc = 1;
3225  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3226  hot_team->t.t_sched.sched = r_sched.sched;
3227  hot_team->t.t_size_changed = 0;
3228}
3229
3230#ifdef KMP_DEBUG
3231
3232typedef struct kmp_team_list_item {
3233  kmp_team_p const *entry;
3234  struct kmp_team_list_item *next;
3235} kmp_team_list_item_t;
3236typedef kmp_team_list_item_t *kmp_team_list_t;
3237
3238static void __kmp_print_structure_team_accum( // Add team to list of teams.
3239    kmp_team_list_t list, // List of teams.
3240    kmp_team_p const *team // Team to add.
3241    ) {
3242
3243  // List must terminate with item where both entry and next are NULL.
3244  // Team is added to the list only once.
3245  // List is sorted in ascending order by team id.
3246  // Team id is *not* a key.
3247
3248  kmp_team_list_t l;
3249
3250  KMP_DEBUG_ASSERT(list != NULL);
3251  if (team == NULL) {
3252    return;
3253  }
3254
3255  __kmp_print_structure_team_accum(list, team->t.t_parent);
3256  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3257
3258  // Search list for the team.
3259  l = list;
3260  while (l->next != NULL && l->entry != team) {
3261    l = l->next;
3262  }
3263  if (l->next != NULL) {
3264    return; // Team has been added before, exit.
3265  }
3266
3267  // Team is not found. Search list again for insertion point.
3268  l = list;
3269  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3270    l = l->next;
3271  }
3272
3273  // Insert team.
3274  {
3275    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3276        sizeof(kmp_team_list_item_t));
3277    *item = *l;
3278    l->entry = team;
3279    l->next = item;
3280  }
3281}
3282
3283static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3284
3285                                       ) {
3286  __kmp_printf("%s", title);
3287  if (team != NULL) {
3288    __kmp_printf("%2x %p\n", team->t.t_id, team);
3289  } else {
3290    __kmp_printf(" - (nil)\n");
3291  }
3292}
3293
3294static void __kmp_print_structure_thread(char const *title,
3295                                         kmp_info_p const *thread) {
3296  __kmp_printf("%s", title);
3297  if (thread != NULL) {
3298    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3299  } else {
3300    __kmp_printf(" - (nil)\n");
3301  }
3302}
3303
3304void __kmp_print_structure(void) {
3305
3306  kmp_team_list_t list;
3307
3308  // Initialize list of teams.
3309  list =
3310      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3311  list->entry = NULL;
3312  list->next = NULL;
3313
3314  __kmp_printf("\n------------------------------\nGlobal Thread "
3315               "Table\n------------------------------\n");
3316  {
3317    int gtid;
3318    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3319      __kmp_printf("%2d", gtid);
3320      if (__kmp_threads != NULL) {
3321        __kmp_printf(" %p", __kmp_threads[gtid]);
3322      }
3323      if (__kmp_root != NULL) {
3324        __kmp_printf(" %p", __kmp_root[gtid]);
3325      }
3326      __kmp_printf("\n");
3327    }
3328  }
3329
3330  // Print out __kmp_threads array.
3331  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3332               "----------\n");
3333  if (__kmp_threads != NULL) {
3334    int gtid;
3335    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3336      kmp_info_t const *thread = __kmp_threads[gtid];
3337      if (thread != NULL) {
3338        __kmp_printf("GTID %2d %p:\n", gtid, thread);
3339        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3340        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3341        __kmp_print_structure_team("    Serial Team:  ",
3342                                   thread->th.th_serial_team);
3343        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3344        __kmp_print_structure_thread("    Master:       ",
3345                                     thread->th.th_team_master);
3346        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3347        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3348        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3349        __kmp_print_structure_thread("    Next in pool: ",
3350                                     thread->th.th_next_pool);
3351        __kmp_printf("\n");
3352        __kmp_print_structure_team_accum(list, thread->th.th_team);
3353        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3354      }
3355    }
3356  } else {
3357    __kmp_printf("Threads array is not allocated.\n");
3358  }
3359
3360  // Print out __kmp_root array.
3361  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3362               "--------\n");
3363  if (__kmp_root != NULL) {
3364    int gtid;
3365    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3366      kmp_root_t const *root = __kmp_root[gtid];
3367      if (root != NULL) {
3368        __kmp_printf("GTID %2d %p:\n", gtid, root);
3369        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3370        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3371        __kmp_print_structure_thread("    Uber Thread:  ",
3372                                     root->r.r_uber_thread);
3373        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3374        __kmp_printf("    In Parallel:  %2d\n",
3375                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3376        __kmp_printf("\n");
3377        __kmp_print_structure_team_accum(list, root->r.r_root_team);
3378        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3379      }
3380    }
3381  } else {
3382    __kmp_printf("Ubers array is not allocated.\n");
3383  }
3384
3385  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3386               "--------\n");
3387  while (list->next != NULL) {
3388    kmp_team_p const *team = list->entry;
3389    int i;
3390    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3391    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3392    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
3393    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3394    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3395    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3396    for (i = 0; i < team->t.t_nproc; ++i) {
3397      __kmp_printf("    Thread %2d:      ", i);
3398      __kmp_print_structure_thread("", team->t.t_threads[i]);
3399    }
3400    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3401    __kmp_printf("\n");
3402    list = list->next;
3403  }
3404
3405  // Print out __kmp_thread_pool and __kmp_team_pool.
3406  __kmp_printf("\n------------------------------\nPools\n----------------------"
3407               "--------\n");
3408  __kmp_print_structure_thread("Thread pool:          ",
3409                               CCAST(kmp_info_t *, __kmp_thread_pool));
3410  __kmp_print_structure_team("Team pool:            ",
3411                             CCAST(kmp_team_t *, __kmp_team_pool));
3412  __kmp_printf("\n");
3413
3414  // Free team list.
3415  while (list != NULL) {
3416    kmp_team_list_item_t *item = list;
3417    list = list->next;
3418    KMP_INTERNAL_FREE(item);
3419  }
3420}
3421
3422#endif
3423
3424//---------------------------------------------------------------------------
3425//  Stuff for per-thread fast random number generator
3426//  Table of primes
3427static const unsigned __kmp_primes[] = {
3428    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3429    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3430    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3431    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3432    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3433    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3434    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3435    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3436    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3437    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3438    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3439
3440//---------------------------------------------------------------------------
3441//  __kmp_get_random: Get a random number using a linear congruential method.
3442unsigned short __kmp_get_random(kmp_info_t *thread) {
3443  unsigned x = thread->th.th_x;
3444  unsigned short r = x >> 16;
3445
3446  thread->th.th_x = x * thread->th.th_a + 1;
3447
3448  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3449                thread->th.th_info.ds.ds_tid, r));
3450
3451  return r;
3452}
3453//--------------------------------------------------------
3454// __kmp_init_random: Initialize a random number generator
3455void __kmp_init_random(kmp_info_t *thread) {
3456  unsigned seed = thread->th.th_info.ds.ds_tid;
3457
3458  thread->th.th_a =
3459      __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3460  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3461  KA_TRACE(30,
3462           ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3463}
3464
3465#if KMP_OS_WINDOWS
3466/* reclaim array entries for root threads that are already dead, returns number
3467 * reclaimed */
3468static int __kmp_reclaim_dead_roots(void) {
3469  int i, r = 0;
3470
3471  for (i = 0; i < __kmp_threads_capacity; ++i) {
3472    if (KMP_UBER_GTID(i) &&
3473        !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3474        !__kmp_root[i]
3475             ->r.r_active) { // AC: reclaim only roots died in non-active state
3476      r += __kmp_unregister_root_other_thread(i);
3477    }
3478  }
3479  return r;
3480}
3481#endif
3482
3483/* This function attempts to create free entries in __kmp_threads and
3484   __kmp_root, and returns the number of free entries generated.
3485
3486   For Windows* OS static library, the first mechanism used is to reclaim array
3487   entries for root threads that are already dead.
3488
3489   On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3490   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3491   capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3492   threadprivate cache array has been created. Synchronization with
3493   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3494
3495   After any dead root reclamation, if the clipping value allows array expansion
3496   to result in the generation of a total of nNeed free slots, the function does
3497   that expansion. If not, nothing is done beyond the possible initial root
3498   thread reclamation.
3499
3500   If any argument is negative, the behavior is undefined. */
3501static int __kmp_expand_threads(int nNeed) {
3502  int added = 0;
3503  int minimumRequiredCapacity;
3504  int newCapacity;
3505  kmp_info_t **newThreads;
3506  kmp_root_t **newRoot;
3507
3508// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3509// resizing __kmp_threads does not need additional protection if foreign
3510// threads are present
3511
3512#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3513  /* only for Windows static library */
3514  /* reclaim array entries for root threads that are already dead */
3515  added = __kmp_reclaim_dead_roots();
3516
3517  if (nNeed) {
3518    nNeed -= added;
3519    if (nNeed < 0)
3520      nNeed = 0;
3521  }
3522#endif
3523  if (nNeed <= 0)
3524    return added;
3525
3526  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3527  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3528  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3529  // > __kmp_max_nth in one of two ways:
3530  //
3531  // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3532  //    may not be resused by another thread, so we may need to increase
3533  //    __kmp_threads_capacity to __kmp_max_nth + 1.
3534  //
3535  // 2) New foreign root(s) are encountered.  We always register new foreign
3536  //    roots. This may cause a smaller # of threads to be allocated at
3537  //    subsequent parallel regions, but the worker threads hang around (and
3538  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3539  //
3540  // Anyway, that is the reason for moving the check to see if
3541  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3542  // instead of having it performed here. -BB
3543
3544  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3545
3546  /* compute expansion headroom to check if we can expand */
3547  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3548    /* possible expansion too small -- give up */
3549    return added;
3550  }
3551  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3552
3553  newCapacity = __kmp_threads_capacity;
3554  do {
3555    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3556                                                          : __kmp_sys_max_nth;
3557  } while (newCapacity < minimumRequiredCapacity);
3558  newThreads = (kmp_info_t **)__kmp_allocate(
3559      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3560  newRoot =
3561      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3562  KMP_MEMCPY(newThreads, __kmp_threads,
3563             __kmp_threads_capacity * sizeof(kmp_info_t *));
3564  KMP_MEMCPY(newRoot, __kmp_root,
3565             __kmp_threads_capacity * sizeof(kmp_root_t *));
3566
3567  kmp_info_t **temp_threads = __kmp_threads;
3568  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3569  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3570  __kmp_free(temp_threads);
3571  added += newCapacity - __kmp_threads_capacity;
3572  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3573
3574  if (newCapacity > __kmp_tp_capacity) {
3575    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3576    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3577      __kmp_threadprivate_resize_cache(newCapacity);
3578    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3579      *(volatile int *)&__kmp_tp_capacity = newCapacity;
3580    }
3581    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3582  }
3583
3584  return added;
3585}
3586
3587/* Register the current thread as a root thread and obtain our gtid. We must
3588   have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3589   thread that calls from __kmp_do_serial_initialize() */
3590int __kmp_register_root(int initial_thread) {
3591  kmp_info_t *root_thread;
3592  kmp_root_t *root;
3593  int gtid;
3594  int capacity;
3595  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3596  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3597  KMP_MB();
3598
3599  /* 2007-03-02:
3600     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3601     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3602     work as expected -- it may return false (that means there is at least one
3603     empty slot in __kmp_threads array), but it is possible the only free slot
3604     is #0, which is reserved for initial thread and so cannot be used for this
3605     one. Following code workarounds this bug.
3606
3607     However, right solution seems to be not reserving slot #0 for initial
3608     thread because:
3609     (1) there is no magic in slot #0,
3610     (2) we cannot detect initial thread reliably (the first thread which does
3611        serial initialization may be not a real initial thread).
3612  */
3613  capacity = __kmp_threads_capacity;
3614  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3615    --capacity;
3616  }
3617
3618  /* see if there are too many threads */
3619  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3620    if (__kmp_tp_cached) {
3621      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3622                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3623                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3624    } else {
3625      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3626                  __kmp_msg_null);
3627    }
3628  }
3629
3630  /* find an available thread slot */
3631  /* Don't reassign the zero slot since we need that to only be used by initial
3632     thread */
3633  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3634       gtid++)
3635    ;
3636  KA_TRACE(1,
3637           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3638  KMP_ASSERT(gtid < __kmp_threads_capacity);
3639
3640  /* update global accounting */
3641  __kmp_all_nth++;
3642  TCW_4(__kmp_nth, __kmp_nth + 1);
3643
3644  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3645  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3646  if (__kmp_adjust_gtid_mode) {
3647    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3648      if (TCR_4(__kmp_gtid_mode) != 2) {
3649        TCW_4(__kmp_gtid_mode, 2);
3650      }
3651    } else {
3652      if (TCR_4(__kmp_gtid_mode) != 1) {
3653        TCW_4(__kmp_gtid_mode, 1);
3654      }
3655    }
3656  }
3657
3658#ifdef KMP_ADJUST_BLOCKTIME
3659  /* Adjust blocktime to zero if necessary            */
3660  /* Middle initialization might not have occurred yet */
3661  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3662    if (__kmp_nth > __kmp_avail_proc) {
3663      __kmp_zero_bt = TRUE;
3664    }
3665  }
3666#endif /* KMP_ADJUST_BLOCKTIME */
3667
3668  /* setup this new hierarchy */
3669  if (!(root = __kmp_root[gtid])) {
3670    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3671    KMP_DEBUG_ASSERT(!root->r.r_root_team);
3672  }
3673
3674#if KMP_STATS_ENABLED
3675  // Initialize stats as soon as possible (right after gtid assignment).
3676  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3677  __kmp_stats_thread_ptr->startLife();
3678  KMP_SET_THREAD_STATE(SERIAL_REGION);
3679  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3680#endif
3681  __kmp_initialize_root(root);
3682
3683  /* setup new root thread structure */
3684  if (root->r.r_uber_thread) {
3685    root_thread = root->r.r_uber_thread;
3686  } else {
3687    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3688    if (__kmp_storage_map) {
3689      __kmp_print_thread_storage_map(root_thread, gtid);
3690    }
3691    root_thread->th.th_info.ds.ds_gtid = gtid;
3692#if OMPT_SUPPORT
3693    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3694#endif
3695    root_thread->th.th_root = root;
3696    if (__kmp_env_consistency_check) {
3697      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3698    }
3699#if USE_FAST_MEMORY
3700    __kmp_initialize_fast_memory(root_thread);
3701#endif /* USE_FAST_MEMORY */
3702
3703#if KMP_USE_BGET
3704    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3705    __kmp_initialize_bget(root_thread);
3706#endif
3707    __kmp_init_random(root_thread); // Initialize random number generator
3708  }
3709
3710  /* setup the serial team held in reserve by the root thread */
3711  if (!root_thread->th.th_serial_team) {
3712    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3713    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3714    root_thread->th.th_serial_team = __kmp_allocate_team(
3715        root, 1, 1,
3716#if OMPT_SUPPORT
3717        ompt_data_none, // root parallel id
3718#endif
3719        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3720  }
3721  KMP_ASSERT(root_thread->th.th_serial_team);
3722  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3723                root_thread->th.th_serial_team));
3724
3725  /* drop root_thread into place */
3726  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3727
3728  root->r.r_root_team->t.t_threads[0] = root_thread;
3729  root->r.r_hot_team->t.t_threads[0] = root_thread;
3730  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3731  // AC: the team created in reserve, not for execution (it is unused for now).
3732  root_thread->th.th_serial_team->t.t_serialized = 0;
3733  root->r.r_uber_thread = root_thread;
3734
3735  /* initialize the thread, get it ready to go */
3736  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3737  TCW_4(__kmp_init_gtid, TRUE);
3738
3739  /* prepare the master thread for get_gtid() */
3740  __kmp_gtid_set_specific(gtid);
3741
3742#if USE_ITT_BUILD
3743  __kmp_itt_thread_name(gtid);
3744#endif /* USE_ITT_BUILD */
3745
3746#ifdef KMP_TDATA_GTID
3747  __kmp_gtid = gtid;
3748#endif
3749  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3750  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3751
3752  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3753                "plain=%u\n",
3754                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3755                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3756                KMP_INIT_BARRIER_STATE));
3757  { // Initialize barrier data.
3758    int b;
3759    for (b = 0; b < bs_last_barrier; ++b) {
3760      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3761#if USE_DEBUGGER
3762      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3763#endif
3764    }
3765  }
3766  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3767                   KMP_INIT_BARRIER_STATE);
3768
3769#if KMP_AFFINITY_SUPPORTED
3770  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3771  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3772  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3773  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3774  if (TCR_4(__kmp_init_middle)) {
3775    __kmp_affinity_set_init_mask(gtid, TRUE);
3776  }
3777#endif /* KMP_AFFINITY_SUPPORTED */
3778  root_thread->th.th_def_allocator = __kmp_def_allocator;
3779  root_thread->th.th_prev_level = 0;
3780  root_thread->th.th_prev_num_threads = 1;
3781
3782  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3783  tmp->cg_root = root_thread;
3784  tmp->cg_thread_limit = __kmp_cg_max_nth;
3785  tmp->cg_nthreads = 1;
3786  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3787                 " cg_nthreads init to 1\n",
3788                 root_thread, tmp));
3789  tmp->up = NULL;
3790  root_thread->th.th_cg_roots = tmp;
3791
3792  __kmp_root_counter++;
3793
3794#if OMPT_SUPPORT
3795  if (!initial_thread && ompt_enabled.enabled) {
3796
3797    kmp_info_t *root_thread = ompt_get_thread();
3798
3799    ompt_set_thread_state(root_thread, ompt_state_overhead);
3800
3801    if (ompt_enabled.ompt_callback_thread_begin) {
3802      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3803          ompt_thread_initial, __ompt_get_thread_data_internal());
3804    }
3805    ompt_data_t *task_data;
3806    ompt_data_t *parallel_data;
3807    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3808    if (ompt_enabled.ompt_callback_implicit_task) {
3809      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3810          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3811    }
3812
3813    ompt_set_thread_state(root_thread, ompt_state_work_serial);
3814  }
3815#endif
3816
3817  KMP_MB();
3818  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3819
3820  return gtid;
3821}
3822
3823#if KMP_NESTED_HOT_TEAMS
3824static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3825                                const int max_level) {
3826  int i, n, nth;
3827  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3828  if (!hot_teams || !hot_teams[level].hot_team) {
3829    return 0;
3830  }
3831  KMP_DEBUG_ASSERT(level < max_level);
3832  kmp_team_t *team = hot_teams[level].hot_team;
3833  nth = hot_teams[level].hot_team_nth;
3834  n = nth - 1; // master is not freed
3835  if (level < max_level - 1) {
3836    for (i = 0; i < nth; ++i) {
3837      kmp_info_t *th = team->t.t_threads[i];
3838      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3839      if (i > 0 && th->th.th_hot_teams) {
3840        __kmp_free(th->th.th_hot_teams);
3841        th->th.th_hot_teams = NULL;
3842      }
3843    }
3844  }
3845  __kmp_free_team(root, team, NULL);
3846  return n;
3847}
3848#endif
3849
3850// Resets a root thread and clear its root and hot teams.
3851// Returns the number of __kmp_threads entries directly and indirectly freed.
3852static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3853  kmp_team_t *root_team = root->r.r_root_team;
3854  kmp_team_t *hot_team = root->r.r_hot_team;
3855  int n = hot_team->t.t_nproc;
3856  int i;
3857
3858  KMP_DEBUG_ASSERT(!root->r.r_active);
3859
3860  root->r.r_root_team = NULL;
3861  root->r.r_hot_team = NULL;
3862  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3863  // before call to __kmp_free_team().
3864  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3865#if KMP_NESTED_HOT_TEAMS
3866  if (__kmp_hot_teams_max_level >
3867      0) { // need to free nested hot teams and their threads if any
3868    for (i = 0; i < hot_team->t.t_nproc; ++i) {
3869      kmp_info_t *th = hot_team->t.t_threads[i];
3870      if (__kmp_hot_teams_max_level > 1) {
3871        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3872      }
3873      if (th->th.th_hot_teams) {
3874        __kmp_free(th->th.th_hot_teams);
3875        th->th.th_hot_teams = NULL;
3876      }
3877    }
3878  }
3879#endif
3880  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3881
3882  // Before we can reap the thread, we need to make certain that all other
3883  // threads in the teams that had this root as ancestor have stopped trying to
3884  // steal tasks.
3885  if (__kmp_tasking_mode != tskm_immediate_exec) {
3886    __kmp_wait_to_unref_task_teams();
3887  }
3888
3889#if KMP_OS_WINDOWS
3890  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3891  KA_TRACE(
3892      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3893           "\n",
3894           (LPVOID) & (root->r.r_uber_thread->th),
3895           root->r.r_uber_thread->th.th_info.ds.ds_thread));
3896  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3897#endif /* KMP_OS_WINDOWS */
3898
3899#if OMPT_SUPPORT
3900  ompt_data_t *task_data;
3901  ompt_data_t *parallel_data;
3902  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3903  if (ompt_enabled.ompt_callback_implicit_task) {
3904    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3905        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3906  }
3907  if (ompt_enabled.ompt_callback_thread_end) {
3908    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3909        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3910  }
3911#endif
3912
3913  TCW_4(__kmp_nth,
3914        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3915  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3916  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3917                 " to %d\n",
3918                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3919                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3920  if (i == 1) {
3921    // need to free contention group structure
3922    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3923                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
3924    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3925    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3926    root->r.r_uber_thread->th.th_cg_roots = NULL;
3927  }
3928  __kmp_reap_thread(root->r.r_uber_thread, 1);
3929
3930  // We canot put root thread to __kmp_thread_pool, so we have to reap it
3931  // instead of freeing.
3932  root->r.r_uber_thread = NULL;
3933  /* mark root as no longer in use */
3934  root->r.r_begin = FALSE;
3935
3936  return n;
3937}
3938
3939void __kmp_unregister_root_current_thread(int gtid) {
3940  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3941  /* this lock should be ok, since unregister_root_current_thread is never
3942     called during an abort, only during a normal close. furthermore, if you
3943     have the forkjoin lock, you should never try to get the initz lock */
3944  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3945  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3946    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3947                  "exiting T#%d\n",
3948                  gtid));
3949    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3950    return;
3951  }
3952  kmp_root_t *root = __kmp_root[gtid];
3953
3954  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3955  KMP_ASSERT(KMP_UBER_GTID(gtid));
3956  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3957  KMP_ASSERT(root->r.r_active == FALSE);
3958
3959  KMP_MB();
3960
3961  kmp_info_t *thread = __kmp_threads[gtid];
3962  kmp_team_t *team = thread->th.th_team;
3963  kmp_task_team_t *task_team = thread->th.th_task_team;
3964
3965  // we need to wait for the proxy tasks before finishing the thread
3966  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3967#if OMPT_SUPPORT
3968    // the runtime is shutting down so we won't report any events
3969    thread->th.ompt_thread_info.state = ompt_state_undefined;
3970#endif
3971    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3972  }
3973
3974  __kmp_reset_root(gtid, root);
3975
3976  /* free up this thread slot */
3977  __kmp_gtid_set_specific(KMP_GTID_DNE);
3978#ifdef KMP_TDATA_GTID
3979  __kmp_gtid = KMP_GTID_DNE;
3980#endif
3981
3982  KMP_MB();
3983  KC_TRACE(10,
3984           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3985
3986  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3987}
3988
3989#if KMP_OS_WINDOWS
3990/* __kmp_forkjoin_lock must be already held
3991   Unregisters a root thread that is not the current thread.  Returns the number
3992   of __kmp_threads entries freed as a result. */
3993static int __kmp_unregister_root_other_thread(int gtid) {
3994  kmp_root_t *root = __kmp_root[gtid];
3995  int r;
3996
3997  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3998  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3999  KMP_ASSERT(KMP_UBER_GTID(gtid));
4000  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4001  KMP_ASSERT(root->r.r_active == FALSE);
4002
4003  r = __kmp_reset_root(gtid, root);
4004  KC_TRACE(10,
4005           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4006  return r;
4007}
4008#endif
4009
4010#if KMP_DEBUG
4011void __kmp_task_info() {
4012
4013  kmp_int32 gtid = __kmp_entry_gtid();
4014  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4015  kmp_info_t *this_thr = __kmp_threads[gtid];
4016  kmp_team_t *steam = this_thr->th.th_serial_team;
4017  kmp_team_t *team = this_thr->th.th_team;
4018
4019  __kmp_printf(
4020      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4021      "ptask=%p\n",
4022      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4023      team->t.t_implicit_task_taskdata[tid].td_parent);
4024}
4025#endif // KMP_DEBUG
4026
4027/* TODO optimize with one big memclr, take out what isn't needed, split
4028   responsibility to workers as much as possible, and delay initialization of
4029   features as much as possible  */
4030static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4031                                  int tid, int gtid) {
4032  /* this_thr->th.th_info.ds.ds_gtid is setup in
4033     kmp_allocate_thread/create_worker.
4034     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4035  kmp_info_t *master = team->t.t_threads[0];
4036  KMP_DEBUG_ASSERT(this_thr != NULL);
4037  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4038  KMP_DEBUG_ASSERT(team);
4039  KMP_DEBUG_ASSERT(team->t.t_threads);
4040  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4041  KMP_DEBUG_ASSERT(master);
4042  KMP_DEBUG_ASSERT(master->th.th_root);
4043
4044  KMP_MB();
4045
4046  TCW_SYNC_PTR(this_thr->th.th_team, team);
4047
4048  this_thr->th.th_info.ds.ds_tid = tid;
4049  this_thr->th.th_set_nproc = 0;
4050  if (__kmp_tasking_mode != tskm_immediate_exec)
4051    // When tasking is possible, threads are not safe to reap until they are
4052    // done tasking; this will be set when tasking code is exited in wait
4053    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4054  else // no tasking --> always safe to reap
4055    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4056  this_thr->th.th_set_proc_bind = proc_bind_default;
4057#if KMP_AFFINITY_SUPPORTED
4058  this_thr->th.th_new_place = this_thr->th.th_current_place;
4059#endif
4060  this_thr->th.th_root = master->th.th_root;
4061
4062  /* setup the thread's cache of the team structure */
4063  this_thr->th.th_team_nproc = team->t.t_nproc;
4064  this_thr->th.th_team_master = master;
4065  this_thr->th.th_team_serialized = team->t.t_serialized;
4066  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4067
4068  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4069
4070  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4071                tid, gtid, this_thr, this_thr->th.th_current_task));
4072
4073  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4074                           team, tid, TRUE);
4075
4076  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4077                tid, gtid, this_thr, this_thr->th.th_current_task));
4078  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4079  // __kmp_initialize_team()?
4080
4081  /* TODO no worksharing in speculative threads */
4082  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4083
4084  this_thr->th.th_local.this_construct = 0;
4085
4086  if (!this_thr->th.th_pri_common) {
4087    this_thr->th.th_pri_common =
4088        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4089    if (__kmp_storage_map) {
4090      __kmp_print_storage_map_gtid(
4091          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4092          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4093    }
4094    this_thr->th.th_pri_head = NULL;
4095  }
4096
4097  if (this_thr != master && // Master's CG root is initialized elsewhere
4098      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4099    // Make new thread's CG root same as master's
4100    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4101    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4102    if (tmp) {
4103      // worker changes CG, need to check if old CG should be freed
4104      int i = tmp->cg_nthreads--;
4105      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4106                     " on node %p of thread %p to %d\n",
4107                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4108      if (i == 1) {
4109        __kmp_free(tmp); // last thread left CG --> free it
4110      }
4111    }
4112    this_thr->th.th_cg_roots = master->th.th_cg_roots;
4113    // Increment new thread's CG root's counter to add the new thread
4114    this_thr->th.th_cg_roots->cg_nthreads++;
4115    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4116                   " node %p of thread %p to %d\n",
4117                   this_thr, this_thr->th.th_cg_roots,
4118                   this_thr->th.th_cg_roots->cg_root,
4119                   this_thr->th.th_cg_roots->cg_nthreads));
4120    this_thr->th.th_current_task->td_icvs.thread_limit =
4121        this_thr->th.th_cg_roots->cg_thread_limit;
4122  }
4123
4124  /* Initialize dynamic dispatch */
4125  {
4126    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4127    // Use team max_nproc since this will never change for the team.
4128    size_t disp_size =
4129        sizeof(dispatch_private_info_t) *
4130        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4131    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4132                  team->t.t_max_nproc));
4133    KMP_ASSERT(dispatch);
4134    KMP_DEBUG_ASSERT(team->t.t_dispatch);
4135    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4136
4137    dispatch->th_disp_index = 0;
4138    dispatch->th_doacross_buf_idx = 0;
4139    if (!dispatch->th_disp_buffer) {
4140      dispatch->th_disp_buffer =
4141          (dispatch_private_info_t *)__kmp_allocate(disp_size);
4142
4143      if (__kmp_storage_map) {
4144        __kmp_print_storage_map_gtid(
4145            gtid, &dispatch->th_disp_buffer[0],
4146            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4147                                          ? 1
4148                                          : __kmp_dispatch_num_buffers],
4149            disp_size, "th_%d.th_dispatch.th_disp_buffer "
4150                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
4151            gtid, team->t.t_id, gtid);
4152      }
4153    } else {
4154      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4155    }
4156
4157    dispatch->th_dispatch_pr_current = 0;
4158    dispatch->th_dispatch_sh_current = 0;
4159
4160    dispatch->th_deo_fcn = 0; /* ORDERED     */
4161    dispatch->th_dxo_fcn = 0; /* END ORDERED */
4162  }
4163
4164  this_thr->th.th_next_pool = NULL;
4165
4166  if (!this_thr->th.th_task_state_memo_stack) {
4167    size_t i;
4168    this_thr->th.th_task_state_memo_stack =
4169        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4170    this_thr->th.th_task_state_top = 0;
4171    this_thr->th.th_task_state_stack_sz = 4;
4172    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4173         ++i) // zero init the stack
4174      this_thr->th.th_task_state_memo_stack[i] = 0;
4175  }
4176
4177  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4178  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4179
4180  KMP_MB();
4181}
4182
4183/* allocate a new thread for the requesting team. this is only called from
4184   within a forkjoin critical section. we will first try to get an available
4185   thread from the thread pool. if none is available, we will fork a new one
4186   assuming we are able to create a new one. this should be assured, as the
4187   caller should check on this first. */
4188kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4189                                  int new_tid) {
4190  kmp_team_t *serial_team;
4191  kmp_info_t *new_thr;
4192  int new_gtid;
4193
4194  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4195  KMP_DEBUG_ASSERT(root && team);
4196#if !KMP_NESTED_HOT_TEAMS
4197  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4198#endif
4199  KMP_MB();
4200
4201  /* first, try to get one from the thread pool */
4202  if (__kmp_thread_pool) {
4203    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4204    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4205    if (new_thr == __kmp_thread_pool_insert_pt) {
4206      __kmp_thread_pool_insert_pt = NULL;
4207    }
4208    TCW_4(new_thr->th.th_in_pool, FALSE);
4209    __kmp_suspend_initialize_thread(new_thr);
4210    __kmp_lock_suspend_mx(new_thr);
4211    if (new_thr->th.th_active_in_pool == TRUE) {
4212      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4213      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4214      new_thr->th.th_active_in_pool = FALSE;
4215    }
4216    __kmp_unlock_suspend_mx(new_thr);
4217
4218    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4219                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4220    KMP_ASSERT(!new_thr->th.th_team);
4221    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4222
4223    /* setup the thread structure */
4224    __kmp_initialize_info(new_thr, team, new_tid,
4225                          new_thr->th.th_info.ds.ds_gtid);
4226    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4227
4228    TCW_4(__kmp_nth, __kmp_nth + 1);
4229
4230    new_thr->th.th_task_state = 0;
4231    new_thr->th.th_task_state_top = 0;
4232    new_thr->th.th_task_state_stack_sz = 4;
4233
4234#ifdef KMP_ADJUST_BLOCKTIME
4235    /* Adjust blocktime back to zero if necessary */
4236    /* Middle initialization might not have occurred yet */
4237    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4238      if (__kmp_nth > __kmp_avail_proc) {
4239        __kmp_zero_bt = TRUE;
4240      }
4241    }
4242#endif /* KMP_ADJUST_BLOCKTIME */
4243
4244#if KMP_DEBUG
4245    // If thread entered pool via __kmp_free_thread, wait_flag should !=
4246    // KMP_BARRIER_PARENT_FLAG.
4247    int b;
4248    kmp_balign_t *balign = new_thr->th.th_bar;
4249    for (b = 0; b < bs_last_barrier; ++b)
4250      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4251#endif
4252
4253    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4254                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4255
4256    KMP_MB();
4257    return new_thr;
4258  }
4259
4260  /* no, well fork a new one */
4261  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4262  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4263
4264#if KMP_USE_MONITOR
4265  // If this is the first worker thread the RTL is creating, then also
4266  // launch the monitor thread.  We try to do this as early as possible.
4267  if (!TCR_4(__kmp_init_monitor)) {
4268    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4269    if (!TCR_4(__kmp_init_monitor)) {
4270      KF_TRACE(10, ("before __kmp_create_monitor\n"));
4271      TCW_4(__kmp_init_monitor, 1);
4272      __kmp_create_monitor(&__kmp_monitor);
4273      KF_TRACE(10, ("after __kmp_create_monitor\n"));
4274#if KMP_OS_WINDOWS
4275      // AC: wait until monitor has started. This is a fix for CQ232808.
4276      // The reason is that if the library is loaded/unloaded in a loop with
4277      // small (parallel) work in between, then there is high probability that
4278      // monitor thread started after the library shutdown. At shutdown it is
4279      // too late to cope with the problem, because when the master is in
4280      // DllMain (process detach) the monitor has no chances to start (it is
4281      // blocked), and master has no means to inform the monitor that the
4282      // library has gone, because all the memory which the monitor can access
4283      // is going to be released/reset.
4284      while (TCR_4(__kmp_init_monitor) < 2) {
4285        KMP_YIELD(TRUE);
4286      }
4287      KF_TRACE(10, ("after monitor thread has started\n"));
4288#endif
4289    }
4290    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4291  }
4292#endif
4293
4294  KMP_MB();
4295  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4296    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4297  }
4298
4299  /* allocate space for it. */
4300  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4301
4302  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4303
4304  if (__kmp_storage_map) {
4305    __kmp_print_thread_storage_map(new_thr, new_gtid);
4306  }
4307
4308  // add the reserve serialized team, initialized from the team's master thread
4309  {
4310    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4311    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4312    new_thr->th.th_serial_team = serial_team =
4313        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4314#if OMPT_SUPPORT
4315                                          ompt_data_none, // root parallel id
4316#endif
4317                                          proc_bind_default, &r_icvs,
4318                                          0 USE_NESTED_HOT_ARG(NULL));
4319  }
4320  KMP_ASSERT(serial_team);
4321  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4322  // execution (it is unused for now).
4323  serial_team->t.t_threads[0] = new_thr;
4324  KF_TRACE(10,
4325           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4326            new_thr));
4327
4328  /* setup the thread structures */
4329  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4330
4331#if USE_FAST_MEMORY
4332  __kmp_initialize_fast_memory(new_thr);
4333#endif /* USE_FAST_MEMORY */
4334
4335#if KMP_USE_BGET
4336  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4337  __kmp_initialize_bget(new_thr);
4338#endif
4339
4340  __kmp_init_random(new_thr); // Initialize random number generator
4341
4342  /* Initialize these only once when thread is grabbed for a team allocation */
4343  KA_TRACE(20,
4344           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4345            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4346
4347  int b;
4348  kmp_balign_t *balign = new_thr->th.th_bar;
4349  for (b = 0; b < bs_last_barrier; ++b) {
4350    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4351    balign[b].bb.team = NULL;
4352    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4353    balign[b].bb.use_oncore_barrier = 0;
4354  }
4355
4356  new_thr->th.th_spin_here = FALSE;
4357  new_thr->th.th_next_waiting = 0;
4358#if KMP_OS_UNIX
4359  new_thr->th.th_blocking = false;
4360#endif
4361
4362#if KMP_AFFINITY_SUPPORTED
4363  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4364  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4365  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4366  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4367#endif
4368  new_thr->th.th_def_allocator = __kmp_def_allocator;
4369  new_thr->th.th_prev_level = 0;
4370  new_thr->th.th_prev_num_threads = 1;
4371
4372  TCW_4(new_thr->th.th_in_pool, FALSE);
4373  new_thr->th.th_active_in_pool = FALSE;
4374  TCW_4(new_thr->th.th_active, TRUE);
4375
4376  /* adjust the global counters */
4377  __kmp_all_nth++;
4378  __kmp_nth++;
4379
4380  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4381  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4382  if (__kmp_adjust_gtid_mode) {
4383    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4384      if (TCR_4(__kmp_gtid_mode) != 2) {
4385        TCW_4(__kmp_gtid_mode, 2);
4386      }
4387    } else {
4388      if (TCR_4(__kmp_gtid_mode) != 1) {
4389        TCW_4(__kmp_gtid_mode, 1);
4390      }
4391    }
4392  }
4393
4394#ifdef KMP_ADJUST_BLOCKTIME
4395  /* Adjust blocktime back to zero if necessary       */
4396  /* Middle initialization might not have occurred yet */
4397  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4398    if (__kmp_nth > __kmp_avail_proc) {
4399      __kmp_zero_bt = TRUE;
4400    }
4401  }
4402#endif /* KMP_ADJUST_BLOCKTIME */
4403
4404  /* actually fork it and create the new worker thread */
4405  KF_TRACE(
4406      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4407  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4408  KF_TRACE(10,
4409           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4410
4411  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4412                new_gtid));
4413  KMP_MB();
4414  return new_thr;
4415}
4416
4417/* Reinitialize team for reuse.
4418   The hot team code calls this case at every fork barrier, so EPCC barrier
4419   test are extremely sensitive to changes in it, esp. writes to the team
4420   struct, which cause a cache invalidation in all threads.
4421   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4422static void __kmp_reinitialize_team(kmp_team_t *team,
4423                                    kmp_internal_control_t *new_icvs,
4424                                    ident_t *loc) {
4425  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4426                team->t.t_threads[0], team));
4427  KMP_DEBUG_ASSERT(team && new_icvs);
4428  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4429  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4430
4431  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4432  // Copy ICVs to the master thread's implicit taskdata
4433  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4434  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4435
4436  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4437                team->t.t_threads[0], team));
4438}
4439
4440/* Initialize the team data structure.
4441   This assumes the t_threads and t_max_nproc are already set.
4442   Also, we don't touch the arguments */
4443static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4444                                  kmp_internal_control_t *new_icvs,
4445                                  ident_t *loc) {
4446  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4447
4448  /* verify */
4449  KMP_DEBUG_ASSERT(team);
4450  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4451  KMP_DEBUG_ASSERT(team->t.t_threads);
4452  KMP_MB();
4453
4454  team->t.t_master_tid = 0; /* not needed */
4455  /* team->t.t_master_bar;        not needed */
4456  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4457  team->t.t_nproc = new_nproc;
4458
4459  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4460  team->t.t_next_pool = NULL;
4461  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4462   * up hot team */
4463
4464  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4465  team->t.t_invoke = NULL; /* not needed */
4466
4467  // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4468  team->t.t_sched.sched = new_icvs->sched.sched;
4469
4470#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4471  team->t.t_fp_control_saved = FALSE; /* not needed */
4472  team->t.t_x87_fpu_control_word = 0; /* not needed */
4473  team->t.t_mxcsr = 0; /* not needed */
4474#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4475
4476  team->t.t_construct = 0;
4477
4478  team->t.t_ordered.dt.t_value = 0;
4479  team->t.t_master_active = FALSE;
4480
4481#ifdef KMP_DEBUG
4482  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4483#endif
4484#if KMP_OS_WINDOWS
4485  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4486#endif
4487
4488  team->t.t_control_stack_top = NULL;
4489
4490  __kmp_reinitialize_team(team, new_icvs, loc);
4491
4492  KMP_MB();
4493  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4494}
4495
4496#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4497/* Sets full mask for thread and returns old mask, no changes to structures. */
4498static void
4499__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4500  if (KMP_AFFINITY_CAPABLE()) {
4501    int status;
4502    if (old_mask != NULL) {
4503      status = __kmp_get_system_affinity(old_mask, TRUE);
4504      int error = errno;
4505      if (status != 0) {
4506        __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4507                    __kmp_msg_null);
4508      }
4509    }
4510    __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4511  }
4512}
4513#endif
4514
4515#if KMP_AFFINITY_SUPPORTED
4516
4517// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4518// It calculats the worker + master thread's partition based upon the parent
4519// thread's partition, and binds each worker to a thread in their partition.
4520// The master thread's partition should already include its current binding.
4521static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4522  // Copy the master thread's place partion to the team struct
4523  kmp_info_t *master_th = team->t.t_threads[0];
4524  KMP_DEBUG_ASSERT(master_th != NULL);
4525  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4526  int first_place = master_th->th.th_first_place;
4527  int last_place = master_th->th.th_last_place;
4528  int masters_place = master_th->th.th_current_place;
4529  team->t.t_first_place = first_place;
4530  team->t.t_last_place = last_place;
4531
4532  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4533                "bound to place %d partition = [%d,%d]\n",
4534                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4535                team->t.t_id, masters_place, first_place, last_place));
4536
4537  switch (proc_bind) {
4538
4539  case proc_bind_default:
4540    // serial teams might have the proc_bind policy set to proc_bind_default. It
4541    // doesn't matter, as we don't rebind master thread for any proc_bind policy
4542    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4543    break;
4544
4545  case proc_bind_master: {
4546    int f;
4547    int n_th = team->t.t_nproc;
4548    for (f = 1; f < n_th; f++) {
4549      kmp_info_t *th = team->t.t_threads[f];
4550      KMP_DEBUG_ASSERT(th != NULL);
4551      th->th.th_first_place = first_place;
4552      th->th.th_last_place = last_place;
4553      th->th.th_new_place = masters_place;
4554      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4555          team->t.t_display_affinity != 1) {
4556        team->t.t_display_affinity = 1;
4557      }
4558
4559      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4560                     "partition = [%d,%d]\n",
4561                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4562                     f, masters_place, first_place, last_place));
4563    }
4564  } break;
4565
4566  case proc_bind_close: {
4567    int f;
4568    int n_th = team->t.t_nproc;
4569    int n_places;
4570    if (first_place <= last_place) {
4571      n_places = last_place - first_place + 1;
4572    } else {
4573      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4574    }
4575    if (n_th <= n_places) {
4576      int place = masters_place;
4577      for (f = 1; f < n_th; f++) {
4578        kmp_info_t *th = team->t.t_threads[f];
4579        KMP_DEBUG_ASSERT(th != NULL);
4580
4581        if (place == last_place) {
4582          place = first_place;
4583        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4584          place = 0;
4585        } else {
4586          place++;
4587        }
4588        th->th.th_first_place = first_place;
4589        th->th.th_last_place = last_place;
4590        th->th.th_new_place = place;
4591        if (__kmp_display_affinity && place != th->th.th_current_place &&
4592            team->t.t_display_affinity != 1) {
4593          team->t.t_display_affinity = 1;
4594        }
4595
4596        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4597                       "partition = [%d,%d]\n",
4598                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4599                       team->t.t_id, f, place, first_place, last_place));
4600      }
4601    } else {
4602      int S, rem, gap, s_count;
4603      S = n_th / n_places;
4604      s_count = 0;
4605      rem = n_th - (S * n_places);
4606      gap = rem > 0 ? n_places / rem : n_places;
4607      int place = masters_place;
4608      int gap_ct = gap;
4609      for (f = 0; f < n_th; f++) {
4610        kmp_info_t *th = team->t.t_threads[f];
4611        KMP_DEBUG_ASSERT(th != NULL);
4612
4613        th->th.th_first_place = first_place;
4614        th->th.th_last_place = last_place;
4615        th->th.th_new_place = place;
4616        if (__kmp_display_affinity && place != th->th.th_current_place &&
4617            team->t.t_display_affinity != 1) {
4618          team->t.t_display_affinity = 1;
4619        }
4620        s_count++;
4621
4622        if ((s_count == S) && rem && (gap_ct == gap)) {
4623          // do nothing, add an extra thread to place on next iteration
4624        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4625          // we added an extra thread to this place; move to next place
4626          if (place == last_place) {
4627            place = first_place;
4628          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4629            place = 0;
4630          } else {
4631            place++;
4632          }
4633          s_count = 0;
4634          gap_ct = 1;
4635          rem--;
4636        } else if (s_count == S) { // place full; don't add extra
4637          if (place == last_place) {
4638            place = first_place;
4639          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4640            place = 0;
4641          } else {
4642            place++;
4643          }
4644          gap_ct++;
4645          s_count = 0;
4646        }
4647
4648        KA_TRACE(100,
4649                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4650                  "partition = [%d,%d]\n",
4651                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4652                  th->th.th_new_place, first_place, last_place));
4653      }
4654      KMP_DEBUG_ASSERT(place == masters_place);
4655    }
4656  } break;
4657
4658  case proc_bind_spread: {
4659    int f;
4660    int n_th = team->t.t_nproc;
4661    int n_places;
4662    int thidx;
4663    if (first_place <= last_place) {
4664      n_places = last_place - first_place + 1;
4665    } else {
4666      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4667    }
4668    if (n_th <= n_places) {
4669      int place = -1;
4670
4671      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4672        int S = n_places / n_th;
4673        int s_count, rem, gap, gap_ct;
4674
4675        place = masters_place;
4676        rem = n_places - n_th * S;
4677        gap = rem ? n_th / rem : 1;
4678        gap_ct = gap;
4679        thidx = n_th;
4680        if (update_master_only == 1)
4681          thidx = 1;
4682        for (f = 0; f < thidx; f++) {
4683          kmp_info_t *th = team->t.t_threads[f];
4684          KMP_DEBUG_ASSERT(th != NULL);
4685
4686          th->th.th_first_place = place;
4687          th->th.th_new_place = place;
4688          if (__kmp_display_affinity && place != th->th.th_current_place &&
4689              team->t.t_display_affinity != 1) {
4690            team->t.t_display_affinity = 1;
4691          }
4692          s_count = 1;
4693          while (s_count < S) {
4694            if (place == last_place) {
4695              place = first_place;
4696            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4697              place = 0;
4698            } else {
4699              place++;
4700            }
4701            s_count++;
4702          }
4703          if (rem && (gap_ct == gap)) {
4704            if (place == last_place) {
4705              place = first_place;
4706            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4707              place = 0;
4708            } else {
4709              place++;
4710            }
4711            rem--;
4712            gap_ct = 0;
4713          }
4714          th->th.th_last_place = place;
4715          gap_ct++;
4716
4717          if (place == last_place) {
4718            place = first_place;
4719          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4720            place = 0;
4721          } else {
4722            place++;
4723          }
4724
4725          KA_TRACE(100,
4726                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4727                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4728                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4729                    f, th->th.th_new_place, th->th.th_first_place,
4730                    th->th.th_last_place, __kmp_affinity_num_masks));
4731        }
4732      } else {
4733        /* Having uniform space of available computation places I can create
4734           T partitions of round(P/T) size and put threads into the first
4735           place of each partition. */
4736        double current = static_cast<double>(masters_place);
4737        double spacing =
4738            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4739        int first, last;
4740        kmp_info_t *th;
4741
4742        thidx = n_th + 1;
4743        if (update_master_only == 1)
4744          thidx = 1;
4745        for (f = 0; f < thidx; f++) {
4746          first = static_cast<int>(current);
4747          last = static_cast<int>(current + spacing) - 1;
4748          KMP_DEBUG_ASSERT(last >= first);
4749          if (first >= n_places) {
4750            if (masters_place) {
4751              first -= n_places;
4752              last -= n_places;
4753              if (first == (masters_place + 1)) {
4754                KMP_DEBUG_ASSERT(f == n_th);
4755                first--;
4756              }
4757              if (last == masters_place) {
4758                KMP_DEBUG_ASSERT(f == (n_th - 1));
4759                last--;
4760              }
4761            } else {
4762              KMP_DEBUG_ASSERT(f == n_th);
4763              first = 0;
4764              last = 0;
4765            }
4766          }
4767          if (last >= n_places) {
4768            last = (n_places - 1);
4769          }
4770          place = first;
4771          current += spacing;
4772          if (f < n_th) {
4773            KMP_DEBUG_ASSERT(0 <= first);
4774            KMP_DEBUG_ASSERT(n_places > first);
4775            KMP_DEBUG_ASSERT(0 <= last);
4776            KMP_DEBUG_ASSERT(n_places > last);
4777            KMP_DEBUG_ASSERT(last_place >= first_place);
4778            th = team->t.t_threads[f];
4779            KMP_DEBUG_ASSERT(th);
4780            th->th.th_first_place = first;
4781            th->th.th_new_place = place;
4782            th->th.th_last_place = last;
4783            if (__kmp_display_affinity && place != th->th.th_current_place &&
4784                team->t.t_display_affinity != 1) {
4785              team->t.t_display_affinity = 1;
4786            }
4787            KA_TRACE(100,
4788                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4789                      "partition = [%d,%d], spacing = %.4f\n",
4790                      __kmp_gtid_from_thread(team->t.t_threads[f]),
4791                      team->t.t_id, f, th->th.th_new_place,
4792                      th->th.th_first_place, th->th.th_last_place, spacing));
4793          }
4794        }
4795      }
4796      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4797    } else {
4798      int S, rem, gap, s_count;
4799      S = n_th / n_places;
4800      s_count = 0;
4801      rem = n_th - (S * n_places);
4802      gap = rem > 0 ? n_places / rem : n_places;
4803      int place = masters_place;
4804      int gap_ct = gap;
4805      thidx = n_th;
4806      if (update_master_only == 1)
4807        thidx = 1;
4808      for (f = 0; f < thidx; f++) {
4809        kmp_info_t *th = team->t.t_threads[f];
4810        KMP_DEBUG_ASSERT(th != NULL);
4811
4812        th->th.th_first_place = place;
4813        th->th.th_last_place = place;
4814        th->th.th_new_place = place;
4815        if (__kmp_display_affinity && place != th->th.th_current_place &&
4816            team->t.t_display_affinity != 1) {
4817          team->t.t_display_affinity = 1;
4818        }
4819        s_count++;
4820
4821        if ((s_count == S) && rem && (gap_ct == gap)) {
4822          // do nothing, add an extra thread to place on next iteration
4823        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4824          // we added an extra thread to this place; move on to next place
4825          if (place == last_place) {
4826            place = first_place;
4827          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4828            place = 0;
4829          } else {
4830            place++;
4831          }
4832          s_count = 0;
4833          gap_ct = 1;
4834          rem--;
4835        } else if (s_count == S) { // place is full; don't add extra thread
4836          if (place == last_place) {
4837            place = first_place;
4838          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4839            place = 0;
4840          } else {
4841            place++;
4842          }
4843          gap_ct++;
4844          s_count = 0;
4845        }
4846
4847        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4848                       "partition = [%d,%d]\n",
4849                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4850                       team->t.t_id, f, th->th.th_new_place,
4851                       th->th.th_first_place, th->th.th_last_place));
4852      }
4853      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4854    }
4855  } break;
4856
4857  default:
4858    break;
4859  }
4860
4861  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4862}
4863
4864#endif // KMP_AFFINITY_SUPPORTED
4865
4866/* allocate a new team data structure to use.  take one off of the free pool if
4867   available */
4868kmp_team_t *
4869__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4870#if OMPT_SUPPORT
4871                    ompt_data_t ompt_parallel_data,
4872#endif
4873                    kmp_proc_bind_t new_proc_bind,
4874                    kmp_internal_control_t *new_icvs,
4875                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4876  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4877  int f;
4878  kmp_team_t *team;
4879  int use_hot_team = !root->r.r_active;
4880  int level = 0;
4881
4882  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4883  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4884  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4885  KMP_MB();
4886
4887#if KMP_NESTED_HOT_TEAMS
4888  kmp_hot_team_ptr_t *hot_teams;
4889  if (master) {
4890    team = master->th.th_team;
4891    level = team->t.t_active_level;
4892    if (master->th.th_teams_microtask) { // in teams construct?
4893      if (master->th.th_teams_size.nteams > 1 &&
4894          ( // #teams > 1
4895              team->t.t_pkfn ==
4896                  (microtask_t)__kmp_teams_master || // inner fork of the teams
4897              master->th.th_teams_level <
4898                  team->t.t_level)) { // or nested parallel inside the teams
4899        ++level; // not increment if #teams==1, or for outer fork of the teams;
4900        // increment otherwise
4901      }
4902    }
4903    hot_teams = master->th.th_hot_teams;
4904    if (level < __kmp_hot_teams_max_level && hot_teams &&
4905        hot_teams[level]
4906            .hot_team) { // hot team has already been allocated for given level
4907      use_hot_team = 1;
4908    } else {
4909      use_hot_team = 0;
4910    }
4911  }
4912#endif
4913  // Optimization to use a "hot" team
4914  if (use_hot_team && new_nproc > 1) {
4915    KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4916#if KMP_NESTED_HOT_TEAMS
4917    team = hot_teams[level].hot_team;
4918#else
4919    team = root->r.r_hot_team;
4920#endif
4921#if KMP_DEBUG
4922    if (__kmp_tasking_mode != tskm_immediate_exec) {
4923      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4924                    "task_team[1] = %p before reinit\n",
4925                    team->t.t_task_team[0], team->t.t_task_team[1]));
4926    }
4927#endif
4928
4929    // Has the number of threads changed?
4930    /* Let's assume the most common case is that the number of threads is
4931       unchanged, and put that case first. */
4932    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4933      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4934      // This case can mean that omp_set_num_threads() was called and the hot
4935      // team size was already reduced, so we check the special flag
4936      if (team->t.t_size_changed == -1) {
4937        team->t.t_size_changed = 1;
4938      } else {
4939        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4940      }
4941
4942      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4943      kmp_r_sched_t new_sched = new_icvs->sched;
4944      // set master's schedule as new run-time schedule
4945      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4946
4947      __kmp_reinitialize_team(team, new_icvs,
4948                              root->r.r_uber_thread->th.th_ident);
4949
4950      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4951                    team->t.t_threads[0], team));
4952      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4953
4954#if KMP_AFFINITY_SUPPORTED
4955      if ((team->t.t_size_changed == 0) &&
4956          (team->t.t_proc_bind == new_proc_bind)) {
4957        if (new_proc_bind == proc_bind_spread) {
4958          __kmp_partition_places(
4959              team, 1); // add flag to update only master for spread
4960        }
4961        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4962                       "proc_bind = %d, partition = [%d,%d]\n",
4963                       team->t.t_id, new_proc_bind, team->t.t_first_place,
4964                       team->t.t_last_place));
4965      } else {
4966        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4967        __kmp_partition_places(team);
4968      }
4969#else
4970      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4971#endif /* KMP_AFFINITY_SUPPORTED */
4972    } else if (team->t.t_nproc > new_nproc) {
4973      KA_TRACE(20,
4974               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4975                new_nproc));
4976
4977      team->t.t_size_changed = 1;
4978#if KMP_NESTED_HOT_TEAMS
4979      if (__kmp_hot_teams_mode == 0) {
4980        // AC: saved number of threads should correspond to team's value in this
4981        // mode, can be bigger in mode 1, when hot team has threads in reserve
4982        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4983        hot_teams[level].hot_team_nth = new_nproc;
4984#endif // KMP_NESTED_HOT_TEAMS
4985        /* release the extra threads we don't need any more */
4986        for (f = new_nproc; f < team->t.t_nproc; f++) {
4987          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4988          if (__kmp_tasking_mode != tskm_immediate_exec) {
4989            // When decreasing team size, threads no longer in the team should
4990            // unref task team.
4991            team->t.t_threads[f]->th.th_task_team = NULL;
4992          }
4993          __kmp_free_thread(team->t.t_threads[f]);
4994          team->t.t_threads[f] = NULL;
4995        }
4996#if KMP_NESTED_HOT_TEAMS
4997      } // (__kmp_hot_teams_mode == 0)
4998      else {
4999        // When keeping extra threads in team, switch threads to wait on own
5000        // b_go flag
5001        for (f = new_nproc; f < team->t.t_nproc; ++f) {
5002          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5003          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5004          for (int b = 0; b < bs_last_barrier; ++b) {
5005            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5006              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5007            }
5008            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5009          }
5010        }
5011      }
5012#endif // KMP_NESTED_HOT_TEAMS
5013      team->t.t_nproc = new_nproc;
5014      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5015      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5016      __kmp_reinitialize_team(team, new_icvs,
5017                              root->r.r_uber_thread->th.th_ident);
5018
5019      // Update remaining threads
5020      for (f = 0; f < new_nproc; ++f) {
5021        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5022      }
5023
5024      // restore the current task state of the master thread: should be the
5025      // implicit task
5026      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5027                    team->t.t_threads[0], team));
5028
5029      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5030
5031#ifdef KMP_DEBUG
5032      for (f = 0; f < team->t.t_nproc; f++) {
5033        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5034                         team->t.t_threads[f]->th.th_team_nproc ==
5035                             team->t.t_nproc);
5036      }
5037#endif
5038
5039      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5040#if KMP_AFFINITY_SUPPORTED
5041      __kmp_partition_places(team);
5042#endif
5043    } else { // team->t.t_nproc < new_nproc
5044#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5045      kmp_affin_mask_t *old_mask;
5046      if (KMP_AFFINITY_CAPABLE()) {
5047        KMP_CPU_ALLOC(old_mask);
5048      }
5049#endif
5050
5051      KA_TRACE(20,
5052               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5053                new_nproc));
5054
5055      team->t.t_size_changed = 1;
5056
5057#if KMP_NESTED_HOT_TEAMS
5058      int avail_threads = hot_teams[level].hot_team_nth;
5059      if (new_nproc < avail_threads)
5060        avail_threads = new_nproc;
5061      kmp_info_t **other_threads = team->t.t_threads;
5062      for (f = team->t.t_nproc; f < avail_threads; ++f) {
5063        // Adjust barrier data of reserved threads (if any) of the team
5064        // Other data will be set in __kmp_initialize_info() below.
5065        int b;
5066        kmp_balign_t *balign = other_threads[f]->th.th_bar;
5067        for (b = 0; b < bs_last_barrier; ++b) {
5068          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5069          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5070#if USE_DEBUGGER
5071          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5072#endif
5073        }
5074      }
5075      if (hot_teams[level].hot_team_nth >= new_nproc) {
5076        // we have all needed threads in reserve, no need to allocate any
5077        // this only possible in mode 1, cannot have reserved threads in mode 0
5078        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5079        team->t.t_nproc = new_nproc; // just get reserved threads involved
5080      } else {
5081        // we may have some threads in reserve, but not enough
5082        team->t.t_nproc =
5083            hot_teams[level]
5084                .hot_team_nth; // get reserved threads involved if any
5085        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5086#endif // KMP_NESTED_HOT_TEAMS
5087        if (team->t.t_max_nproc < new_nproc) {
5088          /* reallocate larger arrays */
5089          __kmp_reallocate_team_arrays(team, new_nproc);
5090          __kmp_reinitialize_team(team, new_icvs, NULL);
5091        }
5092
5093#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5094        /* Temporarily set full mask for master thread before creation of
5095           workers. The reason is that workers inherit the affinity from master,
5096           so if a lot of workers are created on the single core quickly, they
5097           don't get a chance to set their own affinity for a long time. */
5098        __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5099#endif
5100
5101        /* allocate new threads for the hot team */
5102        for (f = team->t.t_nproc; f < new_nproc; f++) {
5103          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5104          KMP_DEBUG_ASSERT(new_worker);
5105          team->t.t_threads[f] = new_worker;
5106
5107          KA_TRACE(20,
5108                   ("__kmp_allocate_team: team %d init T#%d arrived: "
5109                    "join=%llu, plain=%llu\n",
5110                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5111                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5112                    team->t.t_bar[bs_plain_barrier].b_arrived));
5113
5114          { // Initialize barrier data for new threads.
5115            int b;
5116            kmp_balign_t *balign = new_worker->th.th_bar;
5117            for (b = 0; b < bs_last_barrier; ++b) {
5118              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5119              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5120                               KMP_BARRIER_PARENT_FLAG);
5121#if USE_DEBUGGER
5122              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5123#endif
5124            }
5125          }
5126        }
5127
5128#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5129        if (KMP_AFFINITY_CAPABLE()) {
5130          /* Restore initial master thread's affinity mask */
5131          __kmp_set_system_affinity(old_mask, TRUE);
5132          KMP_CPU_FREE(old_mask);
5133        }
5134#endif
5135#if KMP_NESTED_HOT_TEAMS
5136      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5137#endif // KMP_NESTED_HOT_TEAMS
5138      /* make sure everyone is syncronized */
5139      int old_nproc = team->t.t_nproc; // save old value and use to update only
5140      // new threads below
5141      __kmp_initialize_team(team, new_nproc, new_icvs,
5142                            root->r.r_uber_thread->th.th_ident);
5143
5144      /* reinitialize the threads */
5145      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5146      for (f = 0; f < team->t.t_nproc; ++f)
5147        __kmp_initialize_info(team->t.t_threads[f], team, f,
5148                              __kmp_gtid_from_tid(f, team));
5149
5150      if (level) { // set th_task_state for new threads in nested hot team
5151        // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5152        // only need to set the th_task_state for the new threads. th_task_state
5153        // for master thread will not be accurate until after this in
5154        // __kmp_fork_call(), so we look to the master's memo_stack to get the
5155        // correct value.
5156        for (f = old_nproc; f < team->t.t_nproc; ++f)
5157          team->t.t_threads[f]->th.th_task_state =
5158              team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5159      } else { // set th_task_state for new threads in non-nested hot team
5160        int old_state =
5161            team->t.t_threads[0]->th.th_task_state; // copy master's state
5162        for (f = old_nproc; f < team->t.t_nproc; ++f)
5163          team->t.t_threads[f]->th.th_task_state = old_state;
5164      }
5165
5166#ifdef KMP_DEBUG
5167      for (f = 0; f < team->t.t_nproc; ++f) {
5168        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5169                         team->t.t_threads[f]->th.th_team_nproc ==
5170                             team->t.t_nproc);
5171      }
5172#endif
5173
5174      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5175#if KMP_AFFINITY_SUPPORTED
5176      __kmp_partition_places(team);
5177#endif
5178    } // Check changes in number of threads
5179
5180    kmp_info_t *master = team->t.t_threads[0];
5181    if (master->th.th_teams_microtask) {
5182      for (f = 1; f < new_nproc; ++f) {
5183        // propagate teams construct specific info to workers
5184        kmp_info_t *thr = team->t.t_threads[f];
5185        thr->th.th_teams_microtask = master->th.th_teams_microtask;
5186        thr->th.th_teams_level = master->th.th_teams_level;
5187        thr->th.th_teams_size = master->th.th_teams_size;
5188      }
5189    }
5190#if KMP_NESTED_HOT_TEAMS
5191    if (level) {
5192      // Sync barrier state for nested hot teams, not needed for outermost hot
5193      // team.
5194      for (f = 1; f < new_nproc; ++f) {
5195        kmp_info_t *thr = team->t.t_threads[f];
5196        int b;
5197        kmp_balign_t *balign = thr->th.th_bar;
5198        for (b = 0; b < bs_last_barrier; ++b) {
5199          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5200          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5201#if USE_DEBUGGER
5202          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5203#endif
5204        }
5205      }
5206    }
5207#endif // KMP_NESTED_HOT_TEAMS
5208
5209    /* reallocate space for arguments if necessary */
5210    __kmp_alloc_argv_entries(argc, team, TRUE);
5211    KMP_CHECK_UPDATE(team->t.t_argc, argc);
5212    // The hot team re-uses the previous task team,
5213    // if untouched during the previous release->gather phase.
5214
5215    KF_TRACE(10, (" hot_team = %p\n", team));
5216
5217#if KMP_DEBUG
5218    if (__kmp_tasking_mode != tskm_immediate_exec) {
5219      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5220                    "task_team[1] = %p after reinit\n",
5221                    team->t.t_task_team[0], team->t.t_task_team[1]));
5222    }
5223#endif
5224
5225#if OMPT_SUPPORT
5226    __ompt_team_assign_id(team, ompt_parallel_data);
5227#endif
5228
5229    KMP_MB();
5230
5231    return team;
5232  }
5233
5234  /* next, let's try to take one from the team pool */
5235  KMP_MB();
5236  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5237    /* TODO: consider resizing undersized teams instead of reaping them, now
5238       that we have a resizing mechanism */
5239    if (team->t.t_max_nproc >= max_nproc) {
5240      /* take this team from the team pool */
5241      __kmp_team_pool = team->t.t_next_pool;
5242
5243      /* setup the team for fresh use */
5244      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5245
5246      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5247                    "task_team[1] %p to NULL\n",
5248                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
5249      team->t.t_task_team[0] = NULL;
5250      team->t.t_task_team[1] = NULL;
5251
5252      /* reallocate space for arguments if necessary */
5253      __kmp_alloc_argv_entries(argc, team, TRUE);
5254      KMP_CHECK_UPDATE(team->t.t_argc, argc);
5255
5256      KA_TRACE(
5257          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5258               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5259      { // Initialize barrier data.
5260        int b;
5261        for (b = 0; b < bs_last_barrier; ++b) {
5262          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5263#if USE_DEBUGGER
5264          team->t.t_bar[b].b_master_arrived = 0;
5265          team->t.t_bar[b].b_team_arrived = 0;
5266#endif
5267        }
5268      }
5269
5270      team->t.t_proc_bind = new_proc_bind;
5271
5272      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5273                    team->t.t_id));
5274
5275#if OMPT_SUPPORT
5276      __ompt_team_assign_id(team, ompt_parallel_data);
5277#endif
5278
5279      KMP_MB();
5280
5281      return team;
5282    }
5283
5284    /* reap team if it is too small, then loop back and check the next one */
5285    // not sure if this is wise, but, will be redone during the hot-teams
5286    // rewrite.
5287    /* TODO: Use technique to find the right size hot-team, don't reap them */
5288    team = __kmp_reap_team(team);
5289    __kmp_team_pool = team;
5290  }
5291
5292  /* nothing available in the pool, no matter, make a new team! */
5293  KMP_MB();
5294  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5295
5296  /* and set it up */
5297  team->t.t_max_nproc = max_nproc;
5298  /* NOTE well, for some reason allocating one big buffer and dividing it up
5299     seems to really hurt performance a lot on the P4, so, let's not use this */
5300  __kmp_allocate_team_arrays(team, max_nproc);
5301
5302  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5303  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5304
5305  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5306                "%p to NULL\n",
5307                &team->t.t_task_team[0], &team->t.t_task_team[1]));
5308  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5309  // memory, no need to duplicate
5310  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5311  // memory, no need to duplicate
5312
5313  if (__kmp_storage_map) {
5314    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5315  }
5316
5317  /* allocate space for arguments */
5318  __kmp_alloc_argv_entries(argc, team, FALSE);
5319  team->t.t_argc = argc;
5320
5321  KA_TRACE(20,
5322           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5323            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5324  { // Initialize barrier data.
5325    int b;
5326    for (b = 0; b < bs_last_barrier; ++b) {
5327      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5328#if USE_DEBUGGER
5329      team->t.t_bar[b].b_master_arrived = 0;
5330      team->t.t_bar[b].b_team_arrived = 0;
5331#endif
5332    }
5333  }
5334
5335  team->t.t_proc_bind = new_proc_bind;
5336
5337#if OMPT_SUPPORT
5338  __ompt_team_assign_id(team, ompt_parallel_data);
5339  team->t.ompt_serialized_team_info = NULL;
5340#endif
5341
5342  KMP_MB();
5343
5344  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5345                team->t.t_id));
5346
5347  return team;
5348}
5349
5350/* TODO implement hot-teams at all levels */
5351/* TODO implement lazy thread release on demand (disband request) */
5352
5353/* free the team.  return it to the team pool.  release all the threads
5354 * associated with it */
5355void __kmp_free_team(kmp_root_t *root,
5356                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5357  int f;
5358  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5359                team->t.t_id));
5360
5361  /* verify state */
5362  KMP_DEBUG_ASSERT(root);
5363  KMP_DEBUG_ASSERT(team);
5364  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5365  KMP_DEBUG_ASSERT(team->t.t_threads);
5366
5367  int use_hot_team = team == root->r.r_hot_team;
5368#if KMP_NESTED_HOT_TEAMS
5369  int level;
5370  kmp_hot_team_ptr_t *hot_teams;
5371  if (master) {
5372    level = team->t.t_active_level - 1;
5373    if (master->th.th_teams_microtask) { // in teams construct?
5374      if (master->th.th_teams_size.nteams > 1) {
5375        ++level; // level was not increased in teams construct for
5376        // team_of_masters
5377      }
5378      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5379          master->th.th_teams_level == team->t.t_level) {
5380        ++level; // level was not increased in teams construct for
5381        // team_of_workers before the parallel
5382      } // team->t.t_level will be increased inside parallel
5383    }
5384    hot_teams = master->th.th_hot_teams;
5385    if (level < __kmp_hot_teams_max_level) {
5386      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5387      use_hot_team = 1;
5388    }
5389  }
5390#endif // KMP_NESTED_HOT_TEAMS
5391
5392  /* team is done working */
5393  TCW_SYNC_PTR(team->t.t_pkfn,
5394               NULL); // Important for Debugging Support Library.
5395#if KMP_OS_WINDOWS
5396  team->t.t_copyin_counter = 0; // init counter for possible reuse
5397#endif
5398  // Do not reset pointer to parent team to NULL for hot teams.
5399
5400  /* if we are non-hot team, release our threads */
5401  if (!use_hot_team) {
5402    if (__kmp_tasking_mode != tskm_immediate_exec) {
5403      // Wait for threads to reach reapable state
5404      for (f = 1; f < team->t.t_nproc; ++f) {
5405        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5406        kmp_info_t *th = team->t.t_threads[f];
5407        volatile kmp_uint32 *state = &th->th.th_reap_state;
5408        while (*state != KMP_SAFE_TO_REAP) {
5409#if KMP_OS_WINDOWS
5410          // On Windows a thread can be killed at any time, check this
5411          DWORD ecode;
5412          if (!__kmp_is_thread_alive(th, &ecode)) {
5413            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5414            break;
5415          }
5416#endif
5417          // first check if thread is sleeping
5418          kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5419          if (fl.is_sleeping())
5420            fl.resume(__kmp_gtid_from_thread(th));
5421          KMP_CPU_PAUSE();
5422        }
5423      }
5424
5425      // Delete task teams
5426      int tt_idx;
5427      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5428        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5429        if (task_team != NULL) {
5430          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5431            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5432            team->t.t_threads[f]->th.th_task_team = NULL;
5433          }
5434          KA_TRACE(
5435              20,
5436              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5437               __kmp_get_gtid(), task_team, team->t.t_id));
5438#if KMP_NESTED_HOT_TEAMS
5439          __kmp_free_task_team(master, task_team);
5440#endif
5441          team->t.t_task_team[tt_idx] = NULL;
5442        }
5443      }
5444    }
5445
5446    // Reset pointer to parent team only for non-hot teams.
5447    team->t.t_parent = NULL;
5448    team->t.t_level = 0;
5449    team->t.t_active_level = 0;
5450
5451    /* free the worker threads */
5452    for (f = 1; f < team->t.t_nproc; ++f) {
5453      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5454      __kmp_free_thread(team->t.t_threads[f]);
5455      team->t.t_threads[f] = NULL;
5456    }
5457
5458    /* put the team back in the team pool */
5459    /* TODO limit size of team pool, call reap_team if pool too large */
5460    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5461    __kmp_team_pool = (volatile kmp_team_t *)team;
5462  } else { // Check if team was created for the masters in a teams construct
5463    // See if first worker is a CG root
5464    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5465                     team->t.t_threads[1]->th.th_cg_roots);
5466    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5467      // Clean up the CG root nodes on workers so that this team can be re-used
5468      for (f = 1; f < team->t.t_nproc; ++f) {
5469        kmp_info_t *thr = team->t.t_threads[f];
5470        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5471                         thr->th.th_cg_roots->cg_root == thr);
5472        // Pop current CG root off list
5473        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5474        thr->th.th_cg_roots = tmp->up;
5475        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5476                       " up to node %p. cg_nthreads was %d\n",
5477                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5478        int i = tmp->cg_nthreads--;
5479        if (i == 1) {
5480          __kmp_free(tmp); // free CG if we are the last thread in it
5481        }
5482        // Restore current task's thread_limit from CG root
5483        if (thr->th.th_cg_roots)
5484          thr->th.th_current_task->td_icvs.thread_limit =
5485              thr->th.th_cg_roots->cg_thread_limit;
5486      }
5487    }
5488  }
5489
5490  KMP_MB();
5491}
5492
5493/* reap the team.  destroy it, reclaim all its resources and free its memory */
5494kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5495  kmp_team_t *next_pool = team->t.t_next_pool;
5496
5497  KMP_DEBUG_ASSERT(team);
5498  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5499  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5500  KMP_DEBUG_ASSERT(team->t.t_threads);
5501  KMP_DEBUG_ASSERT(team->t.t_argv);
5502
5503  /* TODO clean the threads that are a part of this? */
5504
5505  /* free stuff */
5506  __kmp_free_team_arrays(