/*
 * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_HIER_H
#define KMP_DISPATCH_HIER_H
#include "kmp.h"
#include "kmp_dispatch.h"

// Layer type for scheduling hierarchy
enum kmp_hier_layer_e {
  LAYER_THREAD = -1,
  LAYER_L1,
  LAYER_L2,
  LAYER_L3,
  LAYER_NUMA,
  LAYER_LOOP,
  LAYER_LAST
};
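
// For example, on a hypothetical machine with 2 NUMA domains, 4 L2 caches,
// and 8 cores (one L1 each), a NUMA+L1 hierarchy distributes the loop first
// across the 2 NUMA units, then across the L1 units within each domain, and
// finally among the threads sharing each L1.  LAYER_THREAD acts as the
// implicit bottom of every hierarchy and LAYER_LOOP as the implicit top
// (the whole loop).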

// Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
  switch (type) {
  case kmp_hier_layer_e::LAYER_THREAD:
    return "THREAD";
  case kmp_hier_layer_e::LAYER_L1:
    return "L1";
  case kmp_hier_layer_e::LAYER_L2:
    return "L2";
  case kmp_hier_layer_e::LAYER_L3:
    return "L3";
  case kmp_hier_layer_e::LAYER_NUMA:
    return "NUMA";
  case kmp_hier_layer_e::LAYER_LOOP:
    return "WHOLE_LOOP";
  case kmp_hier_layer_e::LAYER_LAST:
    return "LAST";
  }
  KMP_ASSERT(0);
  // Appease compilers, should never get here
  return "ERROR";
}

// Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
typedef struct kmp_hier_sched_env_t {
  int size;
  int capacity;
  enum sched_type *scheds;
  kmp_int32 *small_chunks;
  kmp_int64 *large_chunks;
  kmp_hier_layer_e *layers;
  // Append a level of the hierarchy
  void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
    if (capacity == 0) {
      scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
                                                 kmp_hier_layer_e::LAYER_LAST);
      layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
                                                  kmp_hier_layer_e::LAYER_LAST);
      capacity = kmp_hier_layer_e::LAYER_LAST;
    }
    int current_size = size;
    KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
    scheds[current_size] = sched;
    layers[current_size] = layer;
    small_chunks[current_size] = chunk;
    large_chunks[current_size] = (kmp_int64)chunk;
    size++;
  }
  // Sort the hierarchy using selection sort; size will always be small
  // (less than LAYER_LAST), so an O(n log n) algorithm is not necessary
  void sort() {
    if (size <= 1)
      return;
    for (int i = 0; i < size; ++i) {
      int switch_index = i;
      for (int j = i + 1; j < size; ++j) {
        if (layers[j] < layers[switch_index])
          switch_index = j;
      }
      if (switch_index != i) {
        kmp_hier_layer_e temp1 = layers[i];
        enum sched_type temp2 = scheds[i];
        kmp_int32 temp3 = small_chunks[i];
        kmp_int64 temp4 = large_chunks[i];
        layers[i] = layers[switch_index];
        scheds[i] = scheds[switch_index];
        small_chunks[i] = small_chunks[switch_index];
        large_chunks[i] = large_chunks[switch_index];
        layers[switch_index] = temp1;
        scheds[switch_index] = temp2;
        small_chunks[switch_index] = temp3;
        large_chunks[switch_index] = temp4;
      }
    }
  }
  // Free all memory
  void deallocate() {
    if (capacity > 0) {
      __kmp_free(scheds);
      __kmp_free(layers);
      __kmp_free(small_chunks);
      __kmp_free(large_chunks);
      scheds = NULL;
      layers = NULL;
      small_chunks = NULL;
      large_chunks = NULL;
    }
    size = 0;
    capacity = 0;
  }
} kmp_hier_sched_env_t;
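
// A minimal usage sketch (hypothetical values; actual parsing is done by the
// OMP_SCHEDULE handling code).  Levels may be appended in any order and are
// then sorted from lowest to highest layer:
//   kmp_hier_sched_env_t env = {0, 0, NULL, NULL, NULL, NULL};
//   env.append(kmp_sch_dynamic_chunked, 4, kmp_hier_layer_e::LAYER_NUMA);
//   env.append(kmp_sch_dynamic_chunked, 1, kmp_hier_layer_e::LAYER_L1);
//   env.sort(); // layers[] is now {LAYER_L1, LAYER_NUMA}
//   // ... build the hierarchy from env ...
//   env.deallocate();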

extern int __kmp_dispatch_hand_threading;
extern kmp_hier_sched_env_t __kmp_hier_scheds;

// Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];

extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
                                        kmp_hier_layer_e t2);
extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);

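// Shared barrier data maintained on each kmp_hier_top_unit_t.  Each field is
// double buffered: while threads read the "current" half (indexed by the
// thread-private kmp_hier_private_bdata_t::index), the unit's master fills
// the "next" half (1 - index), so set_next*() and get_curr_*() never touch
// the same slot.  The private index is flipped at each barrier.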
template <typename T> struct kmp_hier_shared_bdata_t {
  typedef typename traits_t<T>::signed_t ST;
  volatile kmp_uint64 val[2];
  kmp_int32 status[2];
  T lb[2];
  T ub[2];
  ST st[2];
  dispatch_shared_info_template<T> sh[2];
  void zero() {
    val[0] = val[1] = 0;
    status[0] = status[1] = 0;
    lb[0] = lb[1] = 0;
    ub[0] = ub[1] = 0;
    st[0] = st[1] = 0;
    sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
  }
  void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
                            kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
  }
  void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
    lb[1 - index] = nlb;
    ub[1 - index] = nub;
    st[1 - index] = nst;
    status[1 - index] = nstatus;
    sh[1 - index].u.s.iteration = 0;
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return status[1 - index];
  }
  T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
  T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
  ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return &(sh[1 - index]);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
  T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
  T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
  ST get_curr_st(kmp_uint64 index) const { return st[index]; }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return &(sh[index]);
  }
};
/*
 * In the barrier implementations, num_active is the number of threads that are
 * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
 * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
 * structure. tdata is the thread private data that resides on the thread
 * data structure.
 *
 * The reset_shared() method is used to initialize the barrier data on the
 * kmp_hier_top_unit_t hierarchy structure
 *
 * The reset_private() method is used to initialize the barrier data on the
 * thread's private dispatch buffer structure
 *
 * The barrier() method takes an id, which is that thread's id for the
 * kmp_hier_top_unit_t structure, and implements the barrier.  All threads wait
 * inside barrier() until all fellow threads who are attached to that
 * kmp_hier_top_unit_t structure have arrived.
 */
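
// A minimal sketch of the intended call sequence (the unit and thread-data
// variable names here are hypothetical):
//   // once, by a single thread:
//   core_barrier_impl<T>::reset_shared(num_active, &unit->hier_barrier);
//   // once per participating thread:
//   core_barrier_impl<T>::reset_private(num_active, tdata);
//   // per scheduling step, by every thread attached to the unit:
//   core_barrier_impl<T>::barrier(my_id, &unit->hier_barrier, tdata);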

// Core barrier implementation
// Can be used in a unit with 2 to 8 threads
template <typename T> class core_barrier_impl {
  static inline kmp_uint64 get_wait_val(int num_active) {
    kmp_uint64 wait_val = 0LL;
    switch (num_active) {
    case 2:
      wait_val = 0x0101LL;
      break;
    case 3:
      wait_val = 0x010101LL;
      break;
    case 4:
      wait_val = 0x01010101LL;
      break;
    case 5:
      wait_val = 0x0101010101LL;
      break;
    case 6:
      wait_val = 0x010101010101LL;
      break;
    case 7:
      wait_val = 0x01010101010101LL;
      break;
    case 8:
      wait_val = 0x0101010101010101LL;
      break;
    default:
      // don't use the core_barrier_impl for more than 8 threads
      KMP_ASSERT(0);
    }
    return wait_val;
  }

public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                         kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
}
template <typename T>
void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                        kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void core_barrier_impl<T>::barrier(kmp_int32 id,
                                   kmp_hier_shared_bdata_t<T> *bdata,
                                   kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value =
      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  char v = (current_wait_value ? 0x1 : 0x0);
  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
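
// For example, with num_active = 4 the wait value is 0x01010101: each thread
// stores 0x1 into its own byte of val[current_index] and spins in
// __kmp_wait() until the whole 64-bit word equals the wait value.  On the
// next use of the same buffer the threads store 0x0 and wait for the word to
// return to 0, which is why next_wait_value alternates between the byte
// pattern and 0.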

// Counter barrier implementation
// Can be used in a unit with an arbitrary number of active threads
template <typename T> class counter_barrier_impl {
public:
  static void reset_private(kmp_int32 num_active,
                            kmp_hier_private_bdata_t *tdata);
  static void reset_shared(kmp_int32 num_active,
                           kmp_hier_shared_bdata_t<T> *bdata);
  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
                      kmp_hier_private_bdata_t *tdata);
};

template <typename T>
void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                            kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  tdata->index = 0;
  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
}
template <typename T>
void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
                                           kmp_hier_shared_bdata_t<T> *bdata) {
  bdata->val[0] = bdata->val[1] = 0LL;
  bdata->status[0] = bdata->status[1] = 0LL;
}
template <typename T>
void counter_barrier_impl<T>::barrier(kmp_int32 id,
                                      kmp_hier_shared_bdata_t<T> *bdata,
                                      kmp_hier_private_bdata_t *tdata) {
  volatile kmp_int64 *val;
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;

  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index, current_wait_value,
                next_wait_value));
  val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
  KMP_TEST_THEN_INC64(val);
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
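
// For example, with num_active = 4 each arriving thread atomically increments
// val[current_index] and waits until the counter is >= the current wait
// value (4, then 8, then 12, ...).  The counter is never reset; the wait
// value simply advances by num_active on each use of the buffer, and
// __kmp_ge is used so a thread arriving after the value was already reached
// does not block.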

// Data associated with topology unit within a layer
// For example, one kmp_hier_top_unit_t corresponds to one L1 cache
template <typename T> struct kmp_hier_top_unit_t {
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_int32 active; // number of topology units that communicate with this unit
  // chunk information (lower/upper bound, stride, etc.)
  dispatch_private_info_template<T> hier_pr;
  kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
  kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit

  kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
  void reset_shared_barrier() {
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    hier_barrier.zero();
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_shared(active, &hier_barrier);
    } else {
      counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
    }
  }
  void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    if (active == 1)
      return;
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::reset_private(active, tdata);
    } else {
      counter_barrier_impl<T>::reset_private(active, tdata);
    }
  }
  void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(active > 0);
    KMP_DEBUG_ASSERT(id >= 0 && id < active);
    if (active == 1) {
      tdata->index = 1 - tdata->index;
      return;
    }
    if (active >= 2 && active <= 8) {
      core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    } else {
      counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
    }
  }

  kmp_int32 get_next_status(kmp_uint64 index) const {
    return hier_barrier.get_next_status(index);
  }
  T get_next_lb(kmp_uint64 index) const {
    return hier_barrier.get_next_lb(index);
  }
  T get_next_ub(kmp_uint64 index) const {
    return hier_barrier.get_next_ub(index);
  }
  ST get_next_st(kmp_uint64 index) const {
    return hier_barrier.get_next_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
    return hier_barrier.get_next_sh(index);
  }

  kmp_int32 get_curr_status(kmp_uint64 index) const {
    return hier_barrier.get_curr_status(index);
  }
  T get_curr_lb(kmp_uint64 index) const {
    return hier_barrier.get_curr_lb(index);
  }
  T get_curr_ub(kmp_uint64 index) const {
    return hier_barrier.get_curr_ub(index);
  }
  ST get_curr_st(kmp_uint64 index) const {
    return hier_barrier.get_curr_st(index);
  }
  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
    return hier_barrier.get_curr_sh(index);
  }

  void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
                            kmp_uint64 index) {
    hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
  }
  void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
    hier_barrier.set_next(lb, ub, st, status, index);
  }
  dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
  dispatch_private_info_template<T> *get_parent_pr() {
    return &(hier_parent->hier_pr);
  }

  kmp_int32 is_active() const { return active; }
  kmp_int32 get_num_active() const { return active; }
#ifdef KMP_DEBUG
  void print() {
    KD_TRACE(
        10,
        ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
         active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
         hier_pr.u.p.tc));
  }
#endif
};

// Information regarding a single layer within the scheduling hierarchy
template <typename T> struct kmp_hier_layer_info_t {
  int num_active; // number of threads active in this level
  kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
  enum sched_type sched; // static, dynamic, guided, etc.
  typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
  int length; // length of the kmp_hier_top_unit_t array

#ifdef KMP_DEBUG
  // Print this layer's information
  void print() {
    const char *t = __kmp_get_hier_str(type);
    KD_TRACE(
        10,
        ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
         "length:%d\n",
         num_active, t, sched, chunk, length));
  }
#endif
};

/*
 * Structure to implement entire hierarchy
 *
 * The hierarchy is kept as an array of arrays to represent the different
 * layers.  Layer 0 is the lowest layer and layer num_layers - 1 is the
 * highest layer.
 * Example:
 * [ 2 ] -> [ L3 | L3 ]
 * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
 * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
 * There is also an array of layer_info_t which has information regarding
 * each layer
 */
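// In the example above, layers[1][2] is the third L2 unit and its parent is
// an L3 unit in layers[2].  A thread reaches its layer-0 unit through its
// private dispatch buffer and climbs with get_parent(); a sketch of the
// traversal (not additional API):
//   kmp_hier_top_unit_t<T> *u = pr->hier_parent; // layer 0
//   while (u->get_parent() != nullptr)
//     u = u->get_parent(); // ends at the top layer (layer num_layers - 1)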
template <typename T> struct kmp_hier_t {
public:
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;

private:
  int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
                   kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
                   kmp_int32 previous_id, int hier_level) {
    int status;
    kmp_info_t *th = __kmp_threads[gtid];
    auto parent = current->get_parent();
    bool last_layer = (hier_level == get_num_layers() - 1);
    KMP_DEBUG_ASSERT(th);
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
    KMP_DEBUG_ASSERT(current);
    KMP_DEBUG_ASSERT(hier_level >= 0);
    KMP_DEBUG_ASSERT(hier_level < get_num_layers());
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent || last_layer);

    KD_TRACE(
        1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));

    T hier_id = (T)current->get_hier_id();
    // Attempt to grab next iteration range for this level
    if (previous_id == 0) {
      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
                   gtid, hier_level));
      kmp_int32 contains_last;
      T my_lb, my_ub;
      ST my_st;
      T nproc;
      dispatch_shared_info_template<T> volatile *my_sh;
      dispatch_private_info_template<T> *my_pr;
      if (last_layer) {
        // last layer below the very top uses the single shared buffer
        // from the team struct.
        KD_TRACE(10,
                 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
                  gtid, hier_level));
        my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
        nproc = (T)get_top_level_nproc();
      } else {
        // middle layers use the shared buffer inside the kmp_hier_top_unit_t
        // structure
        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
                      gtid, hier_level));
        my_sh =
            parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
        nproc = (T)parent->get_num_active();
      }
      my_pr = current->get_my_pr();
      KMP_DEBUG_ASSERT(my_sh);
      KMP_DEBUG_ASSERT(my_pr);
      enum sched_type schedule = get_sched(hier_level);
      ST chunk = (ST)get_chunk(hier_level);
      status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
                                                &contains_last, &my_lb, &my_ub,
                                                &my_st, nproc, hier_id);
      KD_TRACE(
          10,
          ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
           gtid, hier_level, status));
      // When no iterations are found (status == 0) and this is not the last
      // layer, attempt to go up the hierarchy for more iterations
      if (status == 0 && !last_layer) {
        status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
                              &my_st, hier_id, hier_level + 1);
        KD_TRACE(
            10,
            ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
             gtid, hier_level, status));
        if (status == 1) {
          kmp_hier_private_bdata_t *upper_tdata =
              &(th->th.th_hier_bar_data[hier_level + 1]);
          my_sh = parent->get_curr_sh(upper_tdata->index);
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
                        gtid, hier_level));
          __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
                                        parent->get_curr_lb(upper_tdata->index),
                                        parent->get_curr_ub(upper_tdata->index),
                                        parent->get_curr_st(upper_tdata->index),
#if USE_ITT_BUILD
                                        NULL,
#endif
                                        chunk, nproc, hier_id);
          status = __kmp_dispatch_next_algorithm<T>(
              gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
              hier_id);
          if (!status) {
            KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
                          "setting to 2!\n",
                          gtid, hier_level));
            status = 2;
          }
        }
      }
      current->set_next(my_lb, my_ub, my_st, status, tdata->index);
      // Propagate whether a unit holds the actual global last iteration
      // The contains_last attribute is sent downwards from the top to the
      // bottom of the hierarchy via the contains_last flag inside the
      // private dispatch buffers in the hierarchy's middle layers
      if (contains_last) {
        // If the next_algorithm() method returns 1 for p_last and it is the
        // last layer or our parent contains the last serial chunk, then the
        // chunk must contain the last serial iteration.
        if (last_layer || parent->hier_pr.flags.contains_last) {
          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
                        "to contain last.\n",
                        gtid, hier_level));
          current->hier_pr.flags.contains_last = contains_last;
        }
        if (!current->hier_pr.flags.contains_last)
          contains_last = FALSE;
      }
      if (p_last)
        *p_last = contains_last;
    } // if master thread of this unit
    if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
                gtid, hier_level));
      current->barrier(previous_id, tdata);
      KD_TRACE(10,
               ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
                gtid, hier_level, current->get_curr_status(tdata->index)));
    } else {
      KMP_DEBUG_ASSERT(previous_id == 0);
      return status;
    }
    return current->get_curr_status(tdata->index);
  }

public:
  int top_level_nproc;
  int num_layers;
  bool valid;
  int type_size;
  kmp_hier_layer_info_t<T> *info;
  kmp_hier_top_unit_t<T> **layers;
  // Deallocate all memory from this hierarchy
  void deallocate() {
    for (int i = 0; i < num_layers; ++i)
      if (layers[i] != NULL) {
        __kmp_free(layers[i]);
      }
    if (layers != NULL) {
      __kmp_free(layers);
      layers = NULL;
    }
    if (info != NULL) {
      __kmp_free(info);
      info = NULL;
    }
    num_layers = 0;
    valid = false;
  }
  // Returns true if reallocation is needed else false
  bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
                          const enum sched_type *new_scheds,
                          const ST *new_chunks) const {
    if (!valid || layers == NULL || info == NULL ||
        traits_t<T>::type_size != type_size || n != num_layers)
      return true;
    for (int i = 0; i < n; ++i) {
      if (info[i].type != new_layers[i])
        return true;
      if (info[i].sched != new_scheds[i])
        return true;
      if (info[i].chunk != new_chunks[i])
        return true;
    }
    return false;
  }
  // A single thread should call this function while the other threads wait.
  // It creates a new scheduling hierarchy consisting of new_layers,
  // new_scheds, and new_chunks.  These should come pre-sorted according to
  // kmp_hier_layer_e value.  This function will try to avoid reallocation
  // if it can.
  void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
                     const enum sched_type *new_scheds, const ST *new_chunks) {
    top_level_nproc = 0;
    if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
      KD_TRACE(
          10,
          ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
      for (int i = 0; i < n; ++i) {
        info[i].num_active = 0;
        for (int j = 0; j < get_length(i); ++j)
          layers[i][j].active = 0;
      }
      return;
    }
    KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
    deallocate();
    type_size = traits_t<T>::type_size;
    num_layers = n;
    info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
        sizeof(kmp_hier_layer_info_t<T>) * n);
    layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
        sizeof(kmp_hier_top_unit_t<T> *) * n);
    for (int i = 0; i < n; ++i) {
      int max = 0;
      kmp_hier_layer_e layer = new_layers[i];
      info[i].num_active = 0;
      info[i].type = layer;
      info[i].sched = new_scheds[i];
      info[i].chunk = new_chunks[i];
      max = __kmp_hier_max_units[layer + 1];
      if (max == 0) {
        valid = false;
        KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
        deallocate();
        return;
      }
      info[i].length = max;
      layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
          sizeof(kmp_hier_top_unit_t<T>) * max);
      for (int j = 0; j < max; ++j) {
        layers[i][j].active = 0;
        layers[i][j].hier_pr.flags.use_hier = TRUE;
      }
    }
    valid = true;
  }
  // loc - source file location
  // gtid - global thread identifier
  // pr - this thread's private dispatch buffer (corresponding with gtid)
  // p_last (return value) - pointer to flag indicating this set of iterations
  //                         contains the last iteration
  // p_lb (return value) - lower bound for this chunk of iterations
  // p_ub (return value) - upper bound for this chunk of iterations
  // p_st (return value) - stride for this chunk of iterations
  //
  // Returns 1 if there are more iterations to perform, 0 otherwise
  int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
           kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
    int status;
    kmp_int32 contains_last = 0;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
    auto parent = pr->get_parent();
    KMP_DEBUG_ASSERT(parent);
    KMP_DEBUG_ASSERT(th);
    KMP_DEBUG_ASSERT(tdata);
    KMP_DEBUG_ASSERT(parent);
    T nproc = (T)parent->get_num_active();
    T unit_id = (T)pr->get_hier_id();
    KD_TRACE(
        10,
        ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
         gtid, nproc, unit_id));
    // Handthreading implementation
    // Each iteration is performed by all threads on the last unit (typically
    // cores/tiles)
    // e.g., threads 0,1,2,3 all execute iteration 0
    //       threads 0,1,2,3 all execute iteration 1
    //       threads 4,5,6,7 all execute iteration 2
    //       threads 4,5,6,7 all execute iteration 3
    //       ... etc.
    if (__kmp_dispatch_hand_threading) {
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
                gtid));
      if (unit_id == 0) {
        // For hand threading, the sh buffer on the lowest level is only ever
        // modified and read by the master thread on that level.  Because of
        // this, we can always use the first sh buffer.
        auto sh = &(parent->hier_barrier.sh[0]);
        KMP_DEBUG_ASSERT(sh);
        status = __kmp_dispatch_next_algorithm<T>(
            gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
        if (!status) {
          bool done = false;
          while (!done) {
            done = true;
            status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                  p_st, unit_id, 0);
            if (status == 1) {
              __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                            parent->get_next_lb(tdata->index),
                                            parent->get_next_ub(tdata->index),
                                            parent->get_next_st(tdata->index),
#if USE_ITT_BUILD
                                            NULL,
#endif
                                            pr->u.p.parm1, nproc, unit_id);
              sh->u.s.iteration = 0;
              status = __kmp_dispatch_next_algorithm<T>(
                  gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
                  unit_id);
              if (!status) {
                KD_TRACE(10,
                         ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                          "after next_pr_sh(), trying again.\n",
                          gtid));
                done = false;
              }
            } else if (status == 2) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                            "trying again.\n",
                            gtid));
              done = false;
            }
          }
        }
        parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
      } // if master thread of lowest unit level
      parent->barrier(pr->get_hier_id(), tdata);
      if (unit_id != 0) {
        *p_lb = parent->get_curr_lb(tdata->index);
        *p_ub = parent->get_curr_ub(tdata->index);
        *p_st = parent->get_curr_st(tdata->index);
        status = parent->get_curr_status(tdata->index);
      }
    } else {
      // Normal implementation
      // Each thread grabs an iteration chunk and executes it (no cooperation)
      auto sh = parent->get_curr_sh(tdata->index);
      KMP_DEBUG_ASSERT(sh);
      status = __kmp_dispatch_next_algorithm<T>(
          gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
      KD_TRACE(10,
               ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
                "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
                gtid, status, contains_last, *p_lb, *p_ub, *p_st));
      if (!status) {
        bool done = false;
        while (!done) {
          done = true;
          status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
                                p_st, unit_id, 0);
          if (status == 1) {
            sh = parent->get_curr_sh(tdata->index);
            __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                          parent->get_curr_lb(tdata->index),
                                          parent->get_curr_ub(tdata->index),
                                          parent->get_curr_st(tdata->index),
#if USE_ITT_BUILD
                                          NULL,
#endif
                                          pr->u.p.parm1, nproc, unit_id);
            status = __kmp_dispatch_next_algorithm<T>(
                gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
            if (!status) {
              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                            "after next_pr_sh(), trying again.\n",
                            gtid));
              done = false;
            }
          } else if (status == 2) {
            KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                          "trying again.\n",
                          gtid));
            done = false;
          }
        }
      }
    }
    if (contains_last && !parent->hier_pr.flags.contains_last) {
      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
                    "contains_last to FALSE\n",
                    gtid));
      contains_last = FALSE;
    }
    if (p_last)
      *p_last = contains_last;
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
                  status));
    return status;
  }
  // These functions probe the layer info structure
  // Returns the type of topology unit given level
  kmp_hier_layer_e get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].type;
  }
  // Returns the schedule type at given level
  enum sched_type get_sched(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].sched;
  }
  // Returns the chunk size at given level
  ST get_chunk(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].chunk;
  }
  // Returns the number of active threads at given level
  int get_num_active(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].num_active;
  }
  // Returns the length of topology unit array at given level
  int get_length(int level) const {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    return info[level].length;
  }
  // Returns the topology unit given the level and index
  kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
    KMP_DEBUG_ASSERT(level >= 0);
    KMP_DEBUG_ASSERT(level < num_layers);
    KMP_DEBUG_ASSERT(index >= 0);
    KMP_DEBUG_ASSERT(index < get_length(level));
    return &(layers[level][index]);
  }
  // Returns the number of layers in the hierarchy
  int get_num_layers() const { return num_layers; }
  // Returns the number of threads in the top layer
  // This is necessary because we don't store a topology unit as
  // the very top level and the scheduling algorithms need this information
  int get_top_level_nproc() const { return top_level_nproc; }
  // Return whether this hierarchy is valid or not
  bool is_valid() const { return valid; }
#ifdef KMP_DEBUG
  // Print the hierarchy
  void print() {
    KD_TRACE(10, ("kmp_hier_t:\n"));
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Info[%d] = ", i));
      info[i].print();
    }
    for (int i = num_layers - 1; i >= 0; --i) {
      KD_TRACE(10, ("Layer[%d] =\n", i));
      for (int j = 0; j < info[i].length; ++j) {
        layers[i][j].print();
      }
    }
  }
#endif
};
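
// A rough sketch of how a dispatcher could drive kmp_hier_t for one thread
// once initialization has set pr->flags.use_hier (simplified; in the runtime
// this happens inside the __kmp_dispatch_next() path):
//   T lb, ub;
//   typename traits_t<T>::signed_t st;
//   kmp_int32 last;
//   while (hier->next(loc, gtid, pr, &last, &lb, &ub, &st)) {
//     // execute iterations [lb, ub] with stride st
//   }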

template <typename T>
void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
                                   kmp_hier_layer_e *new_layers,
                                   enum sched_type *new_scheds,
                                   typename traits_t<T>::signed_t *new_chunks,
                                   T lb, T ub,
                                   typename traits_t<T>::signed_t st) {
  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
  int my_buffer_index;
  kmp_info_t *th;
  kmp_team_t *team;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;
  gtid = __kmp_entry_gtid();
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
                gtid, n));
  for (int i = 0; i < n; ++i) {
    const char *layer = __kmp_get_hier_str(new_layers[i]);
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
                  "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
                  gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
  }
#endif // KMP_DEBUG
  KMP_DEBUG_ASSERT(n > 0);
  KMP_DEBUG_ASSERT(new_layers);
  KMP_DEBUG_ASSERT(new_scheds);
  KMP_DEBUG_ASSERT(new_chunks);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
      &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  if (!active) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
                  "Using normal dispatch functions.\n",
                  gtid));
    KMP_DEBUG_ASSERT(pr);
    pr->flags.use_hier = FALSE;
    pr->flags.contains_last = FALSE;
    return;
  }
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  pr->flags.use_hier = TRUE;
  pr->u.p.tc = 0;
  // Have master allocate the hierarchy
  if (__kmp_tid_from_gtid(gtid) == 0) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                  "hierarchy\n",
                  gtid, pr, sh));
    if (sh->hier == NULL) {
      sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
    }
    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
    sh->u.s.iteration = 0;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  // Check to make sure the hierarchy is valid
  kmp_hier_t<T> *hier = sh->hier;
  if (!sh->hier->is_valid()) {
    pr->flags.use_hier = FALSE;
    return;
  }
  // Have threads allocate their thread-private barrier data if it hasn't
  // already been allocated
  if (th->th.th_hier_bar_data == NULL) {
    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
  }
  // Have threads "register" themselves by modifying the active count for each
  // level they are involved in. The active count will act as nthreads for that
  // level within the scheduling algorithms
  for (int i = 0; i < n; ++i) {
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Setup the thread's private dispatch buffer's hierarchy pointers
    if (i == 0)
      pr->hier_parent = my_unit;
    // If this unit is already active, then increment active count and wait
    if (my_unit->is_active()) {
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "is already active (%d)\n",
                    gtid, my_unit, my_unit->active));
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
    // Flag that this unit is active
    if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
      // Do not setup parent pointer for top level unit since it has no parent
      if (i < n - 1) {
        // Setup middle layer pointers to parents
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 hier->get_type(i + 1));
        int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
        my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
      } else {
        // Setup top layer information (no parent pointers are set)
        my_unit->get_my_pr()->hier_id =
            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
                                                 kmp_hier_layer_e::LAYER_LOOP);
        KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
        my_unit->hier_parent = nullptr;
      }
      // Set trip count to 0 so that next() operation will initially climb up
      // the hierarchy to get more iterations (early exit in next() for tc == 0)
      my_unit->get_my_pr()->u.p.tc = 0;
      // Increment this layer's number of active units
      KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "incrementing num_active\n",
                    gtid, my_unit));
    } else {
      KMP_TEST_THEN_INC32(&(my_unit->active));
      break;
    }
  }
  // Set this thread's id
  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
  pr->hier_id = tid % num_threads_per_layer1;
  // For oversubscribed threads, increment their index within the lowest unit
  // This is done to prevent having two or more threads with id 0, id 1, etc.
  if (tid >= num_hw_threads)
    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
  KD_TRACE(
      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
           gtid, pr->hier_id));

  pr->flags.contains_last = FALSE;
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Now that the number of active threads at each level is determined,
  // the barrier data for each unit can be initialized and the last layer's
  // loop information can be initialized.
  int prev_id = pr->get_hier_id();
  for (int i = 0; i < n; ++i) {
    if (prev_id != 0)
      break;
    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
    // Only master threads of this unit within the hierarchy do initialization
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                  gtid, i));
    my_unit->reset_shared_barrier();
    my_unit->hier_pr.flags.contains_last = FALSE;
    // Last layer: initialize the private buffers with the entire loop
    // information so that the next next_algorithm() call properly gets the
    // first chunk of iterations
    if (i == n - 1) {
      __kmp_dispatch_init_algorithm<T>(
          loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
#if USE_ITT_BUILD
          NULL,
#endif
          hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
    }
    prev_id = my_unit->get_hier_id();
  }
  // Initialize each layer of the thread's private barrier data
  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
    unit->reset_private_barrier(tdata);
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

#ifdef KMP_DEBUG
  if (__kmp_tid_from_gtid(gtid) == 0) {
    for (int i = 0; i < n; ++i) {
      KD_TRACE(10,
               ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
                gtid, i, hier->get_num_active(i)));
    }
    hier->print();
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#endif // KMP_DEBUG
}
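
// __kmp_dispatch_init_hierarchy() is expected to be called once per thread at
// loop initialization (e.g., from an OMP_SCHEDULE-driven wrapper such as
// __kmp_dispatch_init_hier_runtime() in kmp_dispatch.cpp), after the sorted
// layer/schedule/chunk arrays have been prepared; the exact call site is an
// assumption here.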
#endif
