#! /usr/perl5/bin/perl
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
#

#
# Pg.pm provides an object-oriented interface to the Solaris
# Processor Group kstats.
#
# See the comments at the end of this file.
#

package Sun::Solaris::Pg;

use strict;
use warnings;
use Sun::Solaris::Kstat;
use Carp;
use Errno;
use List::Util qw(max sum);

our $VERSION = '1.1';

#
# Currently the OS does not have a root PG, so PGs constitute a forest of
# small trees. This module gathers all such trees under one root with ID zero.
# If a PG with that ID is already present, no fake root is created.
#

# ID used for the root of the PG hierarchy (see _pg_create_root()).
my $ROOT_ID = 0;

#
# Special values stored in the 'parent' field of a PG:
#
# PG_NO_PARENT means that kstats have PG parent ID and it is set to -1
# PG_PARENT_UNDEF means that kstats have no PG parent ID
#
# Both values are negative; real parent IDs are tested with ">= 0".
#
use constant {
	PG_NO_PARENT	=> -1,
	PG_PARENT_UNDEF => -2,
};

#
# Sorting order between different sharing relationships. This order is used to
# break ties between PGs with the same number of CPUs. If there are two PGs with
# the same set of CPUs, the one with the higher weight will be the parent of the
# one with the lower weight.
#
# Keys are sharing names with spaces already replaced by underscores (the form
# stored in $pg->{sh_name}).
#
my %relationships_order = (
			   'CPU_PM_Idle_Power_Domain' => 1,
			   'Integer_Pipeline' => 2,
			   'Cache' => 3,
			   'CPU_PM_Active_Power_Domain' => 4,
			   'Floating_Point_Unit' => 5,
			   'Data_Pipe_to_memory' => 6,
			   'Memory' => 7,
			   'Socket' => 8,
			   'System' => 9,
			  );

#
# Object interface to the library. These are methods that can be used by the
# module user.
#

#
# Constructor: create a new object representing a PG snapshot.
# The real work is done by _init(); this routine only implements the retry
# policy and blesses the resulting hash.
#
# The new() method accepts arguments in the form of a hash. The following
# subarguments are supported:
#
#   -cpudata	# Collect per-CPU data from kstats if this is T
#   -tags	# Match PGs to physical relationships if this is T
#   -swload	# Collect software CPU load if this is T
#   -retry	# How many times to retry PG initialization when it fails
#		# with EAGAIN (default: 0)
#   -delay	# Delay in seconds between retries (default: 1)
#
# All arguments are passed to _init() unchanged.
#
# Returns the blessed object, or undef when initialization failed.
#
sub new
{
	my ($class, @init_args) = @_;
	my %opts = @init_args;
	my $attempts_left = $opts{-retry} || 0;
	my $pause = $opts{-delay} || 1;

	my $self = _init(@init_args);

	#
	# Only an EAGAIN failure is considered transient; keep trying while
	# the caller-supplied retry budget lasts.
	#
	while (!$self && ($! == &Errno::EAGAIN) && $attempts_left) {
		# Four-argument select() is used as a sub-second capable sleep
		select(undef, undef, undef, $pause);
		$self = _init(@init_args);
		$attempts_left--;
	}

	return ($self) unless $self;

	if (defined($class)) {
		bless($self, $class);
	} else {
		# No class given: bless into the current package
		bless($self);
	}

	return ($self);
}

#
# Functions below use internal function _pg_get which returns PG hash reference
# corresponding to PG ID specified or 'undef' if the PG can't be found.
#

#
# All methods return 'undef' in scalar context and an empty list in list
# context when unrecoverable errors are detected.
#

#
# Return the ID of the root of the PG hierarchy.
# Returns nothing when the snapshot has no PG tree.
#
sub root
{
	_usage("root(cookie)") unless @_ == 1;
	my ($self) = @_;

	return ($self->{PGTREE} ? $ROOT_ID : ());
}

#
# Return the list of all PG IDs, numerically sorted.
# In scalar context return the number of PGs.
# Returns nothing when the snapshot has no PG tree.
#
sub all
{
	_usage("all(cookie)") unless @_ == 1;
	my ($self) = @_;

	my $tree = $self->{PGTREE};
	return unless $tree;

	my @pg_ids = keys(%{$tree});

	return (scalar @pg_ids) unless wantarray();
	return (_nsort(@pg_ids));
}

#
# Return the list of all PGs in the order they are visited by a depth-first
# walk that starts at the root.
#
sub all_depth_first
{
	_usage("all_depth_first(cookie)") unless @_ == 1;
	my ($self) = @_;

	# root() yields an empty list when there is no tree to walk
	my @start = $self->root();

	return (_walk_depth_first($self, @start));
}

#
# Return the list of all PGs in the order they are visited by a breadth-first
# walk that starts at the root.
#
sub all_breadth_first
{
	_usage("all_breadth_first(cookie)") unless @_ == 1;
	my ($self) = @_;

	# root() yields an empty list when there is no tree to walk
	my @start = $self->root();

	return (_walk_breadth_first($self, @start));
}

#
# Return the list of CPUs in the PG specified.
# In list context the CPUs are returned numerically sorted.
# In scalar context the CPU list is passed through _collapse() and the
# resulting string is returned.
# Returns nothing if the PG can't be found.
#
sub cpus
{
	_usage("cpus(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	my @members = @{$pg->{cpus}};

	return (_nsort(@members)) if wantarray();
	return (_collapse(@members));
}

#
# Return the parent ID of a given PG.
# Returns undef if the PG has no parent (the stored parent is undefined or
# negative) and nothing at all if the PG can't be found.
#
sub parent
{
	_usage("parent(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	my $parent_id = $pg->{parent};

	# Negative values are the PG_NO_PARENT / PG_PARENT_UNDEF markers
	return (undef) unless defined($parent_id);
	return (undef) if $parent_id < 0;

	return ($parent_id);
}

#
# Return the numerically sorted list of children for a given PG.
# In scalar context return the number of children.
# Returns nothing if the PG can't be found or has no children.
#
sub children
{
	_usage("children(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	my $kids = $pg->{children};
	return unless $kids;

	my @kids = @{$kids};

	return (scalar @kids) unless wantarray();
	return (_nsort(@kids));
}

#
# Return the sharing relationship name of the PG.
# Returns nothing if the PG can't be found.
#
sub sh_name
{
	_usage("sh_name(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	return ($pg->{sh_name});
}

#
# Return the is_leaf flag of the specified PG (T for a leaf PG).
# Returns nothing if the PG can't be found.
#
sub is_leaf
{
	_usage("is_leaf(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	return ($pg->{is_leaf});
}

#
# Return the list of leaf PGs, i.e. every PG from all() for which is_leaf()
# is true. In scalar context return the number of leaf PGs.
#
sub leaves
{
	# This method takes the snapshot only; the usage text must say so
	scalar @_ == 1 or _usage("leaves(cookie)");

	my $self = shift;

	return (grep { is_leaf($self, $_) } $self->all());
}

#
# Update varying data in the snapshot.
#
# The kstat chain is refreshed and, for every PG kstat that has a matching PG in
# the snapshot, the utilization counters, snaptime and generation are copied
# over. The software CPU load is re-read and, if per-CPU data was requested at
# creation time, the hardware per-CPU load is re-read as well.
#
# Returns 1 on success and nothing if a kstat for a known PG can't be found.
#
sub update
{
	_usage("update(cookie)") unless @_ == 1;

	my ($self) = @_;
	my $kstat = $self->{KSTAT};

	$kstat->update();

	my $tree = $self->{PGTREE};
	my $pg_kstats = $kstat->{$self->{PG_MODULE}};
	my $old_names = $self->{USE_OLD_KSTATS};

	# Snapshot field => kstat statistic it is copied from
	my %source_of = (
		util			=> 'hw_util',
		current_rate		=> 'hw_util_rate',
		util_rate_max		=> 'hw_util_rate_max',
		util_time_running	=> 'hw_util_time_running',
		util_time_stopped	=> 'hw_util_time_stopped',
		snaptime		=> 'snaptime',
		generation		=> 'generation',
	);

	#
	# Walk PG kstats; PGs that are not part of the snapshot are ignored
	#
	foreach my $id (keys %$pg_kstats) {
		my $pg = $tree->{$id};
		next unless $pg;

		my $fresh = _kstat_get_pg($pg_kstats, $id, $old_names);
		return unless $fresh;

		foreach my $field (keys %source_of) {
			$pg->{$field} = $fresh->{$source_of{$field}};
		}
	}

	#
	# Software load is refreshed unconditionally
	#
	$self->{CPU_LOAD} = _get_sw_cpu_load($kstat);

	#
	# Hardware load per CPU is refreshed only when it was collected by
	# _init()
	#
	_get_hw_cpu_load($self) if $self->{GET_CPU_DATA};

	return (1);
}

#
# Return the list of physical tags for the given PG with duplicates removed.
# In scalar context return the tags joined with commas.
# Returns nothing if the PG can't be found or has no tags.
#
sub tags
{
	_usage("tags(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	my $tag_list = $pg->{tags};
	return unless $tag_list;

	my @distinct = _uniq(@{$tag_list});

	return (@distinct) if wantarray();
	return (join (',', @distinct));
}

#
# Return the list of sharing relationships in the snapshot. Relationships are
# listed in the order PGs are met by a breadth-first walk of the hierarchy, each
# name appearing once. If any PGs are given as arguments, only the
# relationships of those PGs are returned, still in the same order.
#
sub sharing_relationships
{
	_usage("sharing_relationships(cookie, [pg, ...])") unless @_;

	my ($self, @wanted) = @_;
	my @ordered = $self->all_breadth_first();

	if (@wanted) {
		#
		# Caller specified PGs; drop every PG that is not in the
		# caller's list while keeping the breadth-first order.
		#
		my %is_wanted = map { ($_ => 1) } @wanted;
		@ordered = grep { $is_wanted{$_} } @ordered;
	}

	my @names = map { $self->sh_name($_) } @ordered;

	return (_uniq(@names));
}

#
# Return PG generation number. If a PG is specified in the argument, return its
# generation (or nothing if the PG can't be found), otherwise return the
# snapshot generation.
# Snapshot generation is calculated as the total of all PG generations.
#
sub generation
{
	_usage("generation(cookie, [pg])") unless (@_ == 1 || @_ == 2);
	my ($self, @rest) = @_;

	if (@rest) {
		my $pg = _pg_get($self, $rest[0]);
		return unless $pg;
		return ($pg->{generation});
	}

	# No PG given: add up the generation of every PG in the snapshot
	return (sum(map { $_->{generation} } values %{$self->{PGTREE}}));
}

#
# Return the level of the PG in the tree, i.e. the number of ancestors between
# the PG and the top of the hierarchy (a PG without a known parent has level 0).
# The result is cached in the $pg->{level} field.
# Returns nothing if the PG can't be found.
#
sub level
{
	_usage("level(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	# Served from the cache when already computed
	return $pg->{level} if defined($pg->{level});

	# Count ancestors by following the parent links until they run out
	my $depth = 0;
	for (my $ancestor = _pg_get($self, $pg->{parent}); $ancestor;
	     $ancestor = _pg_get($self, $ancestor->{parent})) {
		$depth++;
	}

	$pg->{level} = $depth;

	return ($depth);
}

#
# Return T if the PG supports utilization. We assume that utilization is
# supported by a PG if it shows any non-zero time in util_time_running. It is
# possible that the same condition may be caused by cpustat(1) running ever
# since the PG was created, but there is not much we can do about it.
# Returns nothing if the PG can't be found.
#
sub has_utilization
{
	_usage("has_utilization(cookie, pg)") unless @_ == 2;
	my ($self, $pgid) = @_;

	my $pg = _pg_get($self, $pgid);
	return unless $pg;

	my $time_running = $pg->{util_time_running};

	return ($time_running != 0);
}


#
# Return utilization for the PG.
# Utilization is the absolute difference in the utilization value between two
# snapshots. We can only compare utilization between PGs having the same
# generation ID.
#
# Arguments: two snapshots and a PG ID.
# Returns nothing if the PG is missing from either snapshot, if either snapshot
# has no running time recorded for the PG, if the generations differ, or if a
# utilization value is undefined.
#
sub utilization
{
	# The usage text used to lack its closing parenthesis
	scalar @_ == 3 or _usage("utilization(cookie, cookie1, pg)");
	my $c1 = shift;
	my $c2 = shift;
	my $id = shift;

	#
	# Since we have two cookies, update capacity in both
	#
	_capacity_update($c1, $c2, $id);

	my $pg1 = _pg_get($c1, $id) or return;
	my $pg2 = _pg_get($c2, $id) or return;

	#
	# Nothing to return if one of the utilizations wasn't measured
	#
	return unless ($pg1->{util_time_running} && $pg2->{util_time_running});

	#
	# Verify generation IDs
	#
	return unless $pg1->{generation} eq $pg2->{generation};

	my $u1 = $pg1->{util};
	my $u2 = $pg2->{util};
	return unless defined ($u1) && defined ($u2);

	return (abs($u2 - $u1));
}

#
# Return an estimate of PG capacity. Capacity is calculated as the maximum of
# the capacity recorded for the PG's sharing relationship (observed utilization
# expressed in units per second) and the maximum CPU frequency for all CPUs.
#
# We store capacity per sharing relationship, assuming that the same sharing has
# the same capacity. This may not be true for heterogeneous systems.
#
# Returns nothing if the PG can't be found or has no sharing name.
#
sub capacity
{
	# The usage text used to lack its closing parenthesis
	scalar @_ == 2 or _usage("capacity(cookie, pg)");
	my $self = shift;
	my $pgid = shift;
	my $pg = _pg_get($self, $pgid) or return;
	my $shname = $pg->{sh_name} or return;

	#
	# Either value may be missing (no capacity recorded yet for this
	# sharing relationship); treat a missing value as zero instead of
	# handing undef to max().
	#
	my $max_frequency = $self->{MAX_FREQUENCY} || 0;
	my $observed = $self->{CAPACITY}->{$shname} || 0;

	return (max($max_frequency, $observed));
}

#
# Return accuracy of utilization calculation between two snapshots. The accuracy
# is determined based on the total time spent running and not running the
# counters. If T1 is the time counters were running during the period and T2 is
# the time they were turned off, the accuracy is T1 / (T1 + T2), expressed as a
# percentage rounded to the nearest integer (0 when T1 + T2 is zero).
#
# Returns nothing if the PG is missing from either snapshot or if the PG
# generations differ.
#
sub accuracy
{
	_usage("accuracy(cookie, cookie1, pg)") unless @_ == 3;
	my ($c1, $c2, $id) = @_;

	my $pg1 = _pg_get($c1, $id);
	return unless $pg1;
	my $pg2 = _pg_get($c2, $id);
	return unless $pg2;

	# Both PGs should have the same generation
	return unless $pg1->{generation} eq $pg2->{generation};

	#
	# Time spent with running and with stopped counters between the two
	# snapshots
	#
	my $running = abs($pg2->{util_time_running} -
			  $pg1->{util_time_running});
	my $stopped = abs($pg2->{util_time_stopped} -
			  $pg1->{util_time_stopped});
	my $elapsed = $running + $stopped;

	#
	# Express the running share as a rounded percentage
	#
	my $percent = 0;
	$percent = ($running * 100) / $elapsed if $elapsed;
	$percent = int($percent + 0.5);

	return ($percent > 100 ? 100 : $percent);
}

#
# Return the absolute difference between the snaptime values recorded for the
# PG in two snapshots.
# Returns nothing if the PG is missing from either snapshot or if the PG
# generations differ.
#
sub tdelta
{
	_usage("tdelta(cookie, cookie1, pg)") unless @_ == 3;
	my ($c1, $c2, $id) = @_;

	my $pg1 = _pg_get($c1, $id);
	return unless $pg1;
	my $pg2 = _pg_get($c2, $id);
	return unless $pg2;

	return unless $pg1->{generation} eq $pg2->{generation};

	return (abs($pg1->{snaptime} - $pg2->{snaptime}));
}

#
# Return software utilization between two snapshots.
# In scalar context return software load as percentage.
# In list context return a list (USER, SYSTEM, IDLE, SWLOAD).
# All loads are returned as percentages.
#
# The load is summed over the CPUs of the PG as listed in the first snapshot;
# CPUs lacking load data in either snapshot are skipped.
# Returns nothing if the PG is missing from either snapshot or if the PG
# generations differ.
#
sub sw_utilization
{
	# The usage text used to name tdelta() by mistake
	scalar @_ == 3 or _usage("sw_utilization(cookie, cookie1, pg)");

	my $c1 = shift;
	my $c2 = shift;
	my $id = shift;

	my $pg1 = _pg_get($c1, $id) or return;
	my $pg2 = _pg_get($c2, $id) or return;

	return unless $pg1->{generation} eq $pg2->{generation};

	my @cpus = $c1->cpus($id);

	my $load1 = $c1->{CPU_LOAD};
	my $load2 = $c2->{CPU_LOAD};

	my $idle = 0;
	my $user = 0;
	my $sys = 0;

	foreach my $cpu (@cpus) {
		my $ld1 = $load1->{$cpu};
		my $ld2 = $load2->{$cpu};
		next unless $ld1 && $ld2;

		$idle += $ld2->{cpu_idle} - $ld1->{cpu_idle};
		$user += $ld2->{cpu_user} - $ld1->{cpu_user};
		$sys  += $ld2->{cpu_sys}  - $ld1->{cpu_sys};
	}

	# Prevent division by zero when no time was accounted at all
	my $total = ($idle + $user + $sys) || 1;

	my $swload = ($user + $sys) * 100 / $total;
	$idle   = $idle * 100 / $total;
	$user   = $user * 100 / $total;
	$sys    = $sys  * 100 / $total;

	return (wantarray() ? ($user, $sys, $idle, $swload) : $swload);
}

#
# Return utilization for the PG for a given CPU.
# Utilization is a difference in utilization value between two snapshots.
# We can only compare utilization between PGs having the same generation ID.
#
# Arguments: two snapshots, a PG ID and a CPU ID.
#
# In scalar context return the hardware utilization as a percentage of the
# capacity. In list context return the list
#   (UTILIZATION, ACCURACY, HW_UTILIZATION, SWLOAD, USER, SYSTEM, IDLE)
# where HW_UTILIZATION is the raw counter difference and the software loads are
# percentages (all zero when software load data is missing for the CPU).
#
# Returns nothing if the PG or the per-CPU data is missing from either
# snapshot, if utilization was not measured, if the generations differ or if
# the PG has no sharing name.
#
sub cpu_utilization
{
	# The usage text used to name utilization() and lacked a parenthesis
	scalar @_ == 4 or
	    _usage("cpu_utilization(cookie, cookie1, pg, cpu)");
	my $c1 = shift;
	my $c2 = shift;
	my $id = shift;
	my $cpu = shift;

	my $idle = 0;
	my $user = 0;
	my $sys = 0;
	my $swload = 0;

	#
	# Since we have two cookies, update capacity in both
	#
	_capacity_update($c1, $c2, $id);

	my $pg1 = _pg_get($c1, $id) or return;
	my $pg2 = _pg_get($c2, $id) or return;

	#
	# Nothing to return if one of the utilizations wasn't measured
	#
	return unless ($pg1->{util_time_running} && $pg2->{util_time_running});

	#
	# Nothing to return if CPU data is missing
	#
	return unless $pg1->{cpudata} && $pg2->{cpudata};

	#
	# Verify generation IDs
	#
	return unless $pg1->{generation} eq $pg2->{generation};

	#
	# Get data for the given CPU
	#
	my $cpudata1 = $pg1->{cpudata}->{$cpu};
	my $cpudata2 = $pg2->{cpudata}->{$cpu};

	return unless $cpudata1 && $cpudata2;

	return unless $cpudata1->{generation} == $cpudata2->{generation};

	my $u1 = $cpudata1->{util};
	my $u2 = $cpudata2->{util};
	return unless defined ($u1) && defined ($u2);
	my $hw_utilization = abs ($u1 - $u2);

	#
	# Get time spent with running and stopped counters
	#
	my $trun = abs($cpudata1->{util_time_running} -
		       $cpudata2->{util_time_running});
	my $tstop = abs($cpudata1->{util_time_stopped} -
			$cpudata2->{util_time_stopped});

	my $total = $trun + $tstop;

	#
	# Calculate accuracy as percentage
	#
	my $accuracy = $total ? ($trun * 100) / $total : 0;
	$accuracy = int($accuracy + 0.5);
	$accuracy = 100 if $accuracy > 100;

	my $t1 = $cpudata1->{snaptime};
	my $t2 = $cpudata2->{snaptime};
	my $tdelta = abs ($t1 - $t2);

	my $shname = $pg2->{sh_name} or return;

	#
	# Utilization rate per unit of time. Two snapshots taken at the same
	# snaptime have no elapsed time; report a zero rate instead of dying
	# with a division by zero.
	#
	my $rate = $tdelta ? $hw_utilization / $tdelta : 0;

	#
	# Capacity is the larger of the maximum CPU frequency and the capacity
	# recorded for the sharing relationship; when neither is known, the
	# observed rate itself is used as the capacity.
	#
	my $capacity = max($c2->{MAX_FREQUENCY} || 0,
			   $c2->{CAPACITY}->{$shname} || 0) || $rate;

	# A zero capacity here implies a zero rate as well
	my $utilization = $capacity ? ($rate / $capacity) * 100 : 0;

	my $ld1 = $c1->{CPU_LOAD}->{$cpu};
	my $ld2 = $c2->{CPU_LOAD}->{$cpu};

	if ($ld1 && $ld2) {
		$idle = $ld2->{cpu_idle} - $ld1->{cpu_idle};
		$user = $ld2->{cpu_user} - $ld1->{cpu_user};
		$sys  = $ld2->{cpu_sys}  - $ld1->{cpu_sys};

		# Prevent division by zero
		my $swtotal = ($idle + $user + $sys) || 1;

		$swload = ($user + $sys) * 100 / $swtotal;
		$idle   = $idle * 100 / $swtotal;
		$user   = $user * 100 / $swtotal;
		$sys    = $sys  * 100 / $swtotal;
	}

	return (wantarray() ?
		($utilization, $accuracy, $hw_utilization,
		 $swload, $user, $sys, $idle) :
		$utilization);
}

#
# online_cpus(cookie)
# Return the numerically sorted list of on-line CPUs, i.e. CPUs whose cpu_info
# kstat reports the state 'on-line' or 'no-intr'.
# In scalar context return the number of such CPUs.
# Returns nothing if the snapshot has no kstat data or no cpu_info kstats.
#
sub online_cpus
{
	scalar @_ == 1 or _usage("online_cpus(cookie)");

	my $self = shift or return;
	my $ks = $self->{KSTAT} or return;

	my $cpu_info = $ks->{cpu_info} or return;

	my @cpus = grep {
		my $state = $cpu_info->{$_}->{"cpu_info$_"}->{state};
		# CPUs without a reported state are not counted as on-line
		defined($state) &&
		    ($state eq 'on-line' || $state eq 'no-intr');
	} keys %{$cpu_info};

	#
	# Sort only in list context: the result of sort in scalar context is
	# undefined, so the scalar answer is an explicit count, like all().
	#
	return (wantarray() ? _nsort(@cpus) : scalar @cpus);
}

#
# Support methods
#
# The following methods are not PG specific but are generally useful for PG
# interface consumers
#

#
# Method wrapper around _nsort(): sort the values following the object
# numerically.
#
sub nsort
{
	_usage("nsort(cookie, val, ...)") unless @_;
	my (undef, @values) = @_;

	return (_nsort(@values));
}

#
# Method wrapper around _uniq(): return the values following the object with
# duplicates removed.
# Should be used in list context
#
sub uniq
{
	_usage("uniq(cookie, val, ...)") unless @_;
	my (undef, @values) = @_;

	return (_uniq(@values));
}

#
# Method wrapper around _uniqsort(): sort the values following the object
# numerically and remove duplicates.
# Should be called in list context
#
sub uniqsort
{
	_usage("uniqsort(cookie, val, ...)") unless @_;
	my (undef, @values) = @_;

	return (_uniqsort(@values));
}


#
# Expand all arguments following the object and present them as a numerically
# sorted list without duplicates.
# x,y is expanded as (x y)
# 1-3 ranges are expanded as (1 2 3)
#
sub expand
{
	_usage("expand(cookie, val, ...)") unless @_;
	my (undef, @specs) = @_;

	my @ids = map { _expand($_) } @specs;

	return (_uniqsort(@ids));
}

#
# Consolidate consecutive ids as start-end
# Input: the object followed by a list of ids
# Output: string with space-separated cpu values with ranges
#   collapsed as x-y
# This is a method wrapper around _collapse().
#
sub id_collapse
{
	# The usage text used to name a non-existent collapse() method
	scalar @_ > 0 or _usage("id_collapse(cookie, val, ...)");
	shift;

	return _collapse(@_);
}

#
# Method wrapper around _set_subtract(): return elements of the second list
# not present in the first list. Both lists are passed by reference.
#
sub set_subtract
{
	_usage("set_subtract(cookie, left, right)") unless @_ == 3;
	my (undef, $left, $right) = @_;

	return (_set_subtract($left, $right));
}

#
# Method wrapper around _set_intersect(): return the intersection of two
# lists passed by reference.
#
sub intersect
{
	_usage("intersect(cookie, left, right)") unless @_ == 3;
	my (undef, $left, $right) = @_;

	return (_set_intersect($left, $right));
}

#
# Return elements of the second list not present in the first list, in the
# order they appear in the second list. Both lists are passed by reference.
#
sub _set_subtract
{
	my ($left, $right) = @_;

	# Index of everything found in the first list
	my %in_left = map { ($_ => 1) } @$left;

	# Keep members of @right that are absent from @left
	return (grep { !$in_left{$_} } @$right);
}

#
# END OF PUBLIC INTERFACE
#

#
# INTERNAL FUNCTIONS
#

#
# _usage(): raise a usage error on behalf of the caller.
# The argument is the call synopsis; it is reported through Carp::croak
# prefixed with the package name.
#
sub _usage
{
	my ($synopsis) = @_;

	Carp::croak("Usage: Sun::Solaris::Pg::" . $synopsis);
}

#
# Return the arguments sorted in ascending numeric order.
# Should be called in list context
#
sub _nsort
{
	my @values = @_;

	return (sort { $a <=> $b } @values);
}

#
# Return the input list with duplicates removed; the first occurrence of each
# value is kept and the original order is preserved.
# Should be used in list context
#
sub _uniq
{
	my %already_seen;

	return (grep { !$already_seen{$_}++ } @_);
}

#
# Remove duplicates from the arguments and return what is left sorted in
# ascending numeric order.
# Should be called in list context
#
sub _uniqsort
{
	my @distinct = _uniq(@_);

	return (sort { $a <=> $b } @distinct);
}

#
# Get PG from the snapshot by id.
# Returns nothing if the id is undefined or the snapshot has no PG tree;
# otherwise returns the tree entry for the id (undef if there is none).
#
sub _pg_get
{
	my ($self, $pgid) = @_;

	return unless defined $pgid;

	my $tree = $self->{PGTREE};

	return ($tree ? $tree->{$pgid} : ());
}

#
# Build our representation of a PG from its kstat representation.
# Arguments:
#   PG kstat
#   Reference to the list of CPUs.
# Any CPUs in the PG kstat not present in the CPU list are ignored.
#
# Returns a hash reference describing the PG, or nothing if there is no kstat
# or none of the PG's CPUs is in the CPU list. Every PG is created as a leaf
# (is_leaf set to 1).
#
sub _pg_create_from_kstat
{
	my ($pg_ks, $all_cpus) = @_;

	#
	# Index the CPUs that are available
	#
	my %is_available;
	$is_available{$_}++ foreach @$all_cpus;

	return unless $pg_ks;

	#
	# The kstat lists CPUs in x-y,z form; turn that into a proper list and
	# keep only the CPUs the caller knows about.
	#
	my @cpus = grep { $is_available{$_} } _expand($pg_ks->{cpus});

	#
	# Do not create PG unless it has any CPUs
	#
	return unless @cpus;

	# The ID is published as either 'pg_id' or 'id'
	my $id = defined($pg_ks->{pg_id}) ? $pg_ks->{pg_id} : $pg_ks->{id};

	# A kstat without a parent ID is marked with PG_PARENT_UNDEF
	my $parent = $pg_ks->{parent_pg_id};
	$parent = PG_PARENT_UNDEF unless defined $parent;

	#
	# Sharing names are stored with underscores instead of spaces
	#
	my $sh_name = $pg_ks->{relationship} || $pg_ks->{sharing_relation};
	$sh_name =~ s/ /_/g;

	my %pg = (
		ncpus			=> scalar @cpus,
		cpus			=> \@cpus,
		id			=> $id,
		util			=> $pg_ks->{hw_util},
		current_rate		=> $pg_ks->{hw_util_rate},
		util_rate_max		=> $pg_ks->{hw_util_rate_max},
		util_time_running	=> $pg_ks->{hw_util_time_running},
		util_time_stopped	=> $pg_ks->{hw_util_time_stopped},
		snaptime		=> $pg_ks->{snaptime},
		generation		=> $pg_ks->{generation},
		sh_name			=> $sh_name,
		parent			=> $parent,
		is_leaf			=> 1,
	);

	return \%pg;
}

#
# Create the fake root PG containing all CPUs.
# Arguments: list of CPUs
#
# The root has ID $ROOT_ID, the sharing name 'System', no parent, is not a leaf
# and has all its counters set to zero.
#
sub _pg_create_root
{
	my @cpus = @_;

	my %root = (
		id			=> $ROOT_ID,
		ncpus			=> scalar @cpus,
		util			=> 0,
		current_rate		=> 0,
		util_rate_max		=> 0,
		util_time_running	=> 0,
		util_time_stopped	=> 0,
		snaptime		=> 0,
		generation		=> 0,
		sh_name			=> 'System',
		is_leaf			=> 0,
		cpus			=> \@cpus,
		parent			=> PG_NO_PARENT,
	);

	return (\%root);
}

#
# _pg_all_from_kstats(SNAPSHOT)
# Extract all PG information from kstats and arrange it as a tree.
#
# Returns a hash reference indexed by PG ID, or nothing if the snapshot has no
# kstat data. Each PG gets its 'parent' resolved, a list of direct 'children'
# when it has any, and 'is_leaf' cleared when it has children.
#
sub _pg_all_from_kstats
{
	my $self = shift;
	my $ks = $self->{KSTAT};
	my @all_cpus = @{$self->{CPUS}};

	return unless $ks;

	my $pgtree = {};
	my $pg_info = $ks->{$self->{PG_MODULE}};

	#
	# Walk all PG kstats and copy them to $pgtree->{$id}
	#
	foreach my $id (keys %$pg_info) {
		my $pg_ks = _kstat_get_pg($pg_info, $id,
					  $self->{USE_OLD_KSTATS});
		next unless $pg_ks;

		my $pg = _pg_create_from_kstat($pg_ks, \@all_cpus);

		$pgtree->{$id} = $pg if $pg;
	}

	#
	# OS does not have root PG, so create one.
	#
	if (!$pgtree->{$ROOT_ID}) {
		$pgtree->{$ROOT_ID} = _pg_create_root (@all_cpus);
	}

	#
	# Construct parent-child relationships between PGs
	#

	#
	# Get list of PGs sorted by number of CPUs
	# If two PGs have the same number of CPUs, sort by relationship order.
	#
	my @lineage = sort {
		$a->{ncpus} <=> $b->{ncpus} ||
		_relationship_order($a->{sh_name}) <=>
		_relationship_order($b->{sh_name})
	    } values %$pgtree;

	#
	# For each PG in the lineage discover its parent if it doesn't have one.
	#
	foreach my $i (0 .. $#lineage) {
		my $pg = $lineage[$i];

		#
		# Ignore PGs which already have parent in kstats
		#
		next if ($pg->{parent} >= PG_NO_PARENT);

		#
		# The parent is the first PG further down the lineage (i.e. with
		# at least as many CPUs, or a higher relationship order) whose
		# CPU set includes all CPUs of this PG.
		#
		foreach my $j ($i + 1 .. $#lineage) {
			my $candidate = $lineage[$j];
			if (_is_subset($pg->{cpus}, $candidate->{cpus})) {
				$pg->{parent} = $candidate->{id};
				last;
			}
		}
	}

	#
	# Find all top-level PGs and put them under $root
	#
	foreach my $pgid (keys %$pgtree) {
		next if $pgid == $ROOT_ID;
		my $pg = $pgtree->{$pgid};
		$pg->{parent} = $ROOT_ID unless $pg->{parent} >= 0;
	}

	#
	# Now that we know parents, for each parent add all direct children to
	# their parent sets
	#
	foreach my $pg (@lineage) {
		my $parentid = $pg->{parent};
		next unless defined $parentid;

		#
		# Nothing to record when the parent is not in the tree; this is
		# always the case for the root, whose parent is PG_NO_PARENT.
		#
		my $parent = $pgtree->{$parentid} or next;

		#
		# PGs are created as leaves; a PG that has children is not one.
		#
		$parent->{is_leaf} = 0;
		push (@{$parent->{children}}, $pg->{id});
	}

	return ($pgtree);
}

#
# Read kstats and initialize PG object
# Collect basic information about cmt_pg
# Add list of children and list of CPUs
# Returns the (unblessed) snapshot hash reference; its PGTREE member is the
# hash indexed by pg id.
#
# The _init() function accepts arguments in the form of a hash. The following
# subarguments are supported:
#
#   -cpudata	# Collect per-CPU data from kstats if this is T
#   -tags	# Match PGs to physical relationships if this is T
#   -swload	# Collect software CPU load if this is T
#
# On failure nothing is returned. When no PG kstats are present $! is set to
# ENOTSUPP (or 48 if Errno does not define it); when the PG generation changed
# while the snapshot was being built $! is set to EAGAIN.
#
sub _init
{
	my $ks = Sun::Solaris::Kstat->new(strip_strings => 1);
	return unless $ks;

	my %args = @_;
	my $get_cpu_data = $args{-cpudata};
	my $get_tags = $args{-tags};
	my $get_swload = $args{-swload};

	my $self;

	#
	# The kstat module names in use are detected from the kstats present:
	# 'pg_hw_perf*' if there are any, 'pg' / 'pg_cpu' otherwise.
	#
	my $use_old_kstat_names = scalar(grep {/^pg_hw_perf/ } keys (%$ks)) == 0;

	$self->{MAX_FREQUENCY} = 0;

	$self->{PG_MODULE} = $use_old_kstat_names ? 'pg' : 'pg_hw_perf';
	$self->{PG_CPU_MODULE} =  $use_old_kstat_names ?
	  'pg_cpu' : 'pg_hw_perf_cpu';
	$self->{USE_OLD_KSTATS} = $use_old_kstat_names;

	# Per-CPU data can't be collected if there are no per-CPU PG kstats
	$get_cpu_data = 0 unless  scalar(grep {/^$self->{PG_CPU_MODULE}/ }
					 keys (%$ks));

	# Get list of PG-related kstats
	my $pg_keys = $use_old_kstat_names ? 'pg' : 'pg_hw';

	if (scalar(grep { /^$pg_keys/ } keys (%$ks)) == 0) {
		if (exists(&Errno::ENOTSUPP)) {
			$! = &Errno::ENOTSUPP;
		} else {
			$! = 48;
		}
		return;
	}


	#
	# Mapping of cores and chips to CPUs
	#
	my $hw_mapping;

	#
	# Get list of all CPUs
	#
	my $cpu_info = $ks->{cpu_info};

	#
	# @all_cpus is a list of all cpus
	#
	my @all_cpus = keys %$cpu_info;

	#
	# Save list of all CPUs in the snapshot
	#
	$self->{CPUS} = \@all_cpus;

	#
	# Find CPUs for each core and chip
	# Also while we scan CPU kstats, get maximum frequency of each CPU.
	#
	foreach my $id (@all_cpus) {
		my $ci = $cpu_info->{$id}->{"cpu_info$id"};
		next unless $ci;
		my $core_id = $ci->{core_id};
		my $chip_id = $ci->{chip_id};

		push(@{$hw_mapping->{core}->{$core_id}}, $id)
		  if defined $core_id;
		push(@{$hw_mapping->{chip}->{$chip_id}}, $id)
		  if defined $chip_id;

		#
		# Read CPU frequencies separated by colons. The statistic may
		# be missing, in which case this CPU does not contribute to
		# the maximum frequency.
		#
		my $freqs = $ci->{supported_frequencies_Hz};
		next unless defined $freqs;

		my $max_freq = max(split(/:/, $freqs));
		next unless defined $max_freq;

		# Calculate maximum frequency for the snapshot.
		$self->{MAX_FREQUENCY} = $max_freq if
		  $self->{MAX_FREQUENCY} < $max_freq;
	}

	$self->{KSTAT} = $ks;

	#
	# Convert kstats to PG tree
	#
	my $pgtree = _pg_all_from_kstats($self);
	$self->{PGTREE} = $pgtree;

	#
	# Find capacity estimate per sharing relationship
	#
	foreach my $pgid (keys %$pgtree) {
		my $pg = $pgtree->{$pgid};
		my $shname = $pg->{sh_name};
		my $max_rate = $pg->{util_rate_max};
		$self->{CAPACITY}->{$shname} = $max_rate if
		  !$self->{CAPACITY}->{$shname} ||
		    $self->{CAPACITY}->{$shname} < $max_rate;
	}

	if ($get_tags) {
		#
		# Walk all PGs and mark all PGs that have corresponding hardware
		# entities (system, chips, cores).
		#
		foreach my $pgid (keys %$pgtree) {
			my $pg = $pgtree->{$pgid};
			my @cpus = @{$pg->{cpus}};
			# PGs with a single CPU are never tagged
			next unless scalar @cpus > 1;

			if (_set_equal (\@cpus, \@all_cpus)) {
				#
				# PG has all CPUs in the system.
				#
				push (@{$pg->{tags}}, 'system');
			}

			foreach my $name ('core', 'chip') {
				my $hwdata = $hw_mapping->{$name};
				foreach my $id (keys %$hwdata) {
					# CPUs for this entity
					my @hw_cpus = @{$hwdata->{$id}};
					if (_set_equal (\@cpus, \@hw_cpus)) {
						#
						# PG has exactly the same CPUs
						#
						push (@{$pg->{tags}}, $name);
					}
				}
			}
		}
	}

	#
	# Save software load for each CPU
	#
	if ($get_swload) {
		$self->{CPU_LOAD} = _get_sw_cpu_load($ks);
	}

	#
	# Collect per-CPU utilization data if requested
	#
	if ($get_cpu_data) {
		_get_hw_cpu_load($self);
	}

	$self->{GET_CPU_DATA} = $get_cpu_data;

	#
	# Verify that in the end we have the same PG generation for each PG
	#
	if (! _same_generation($self)) {
		$! = &Errno::EAGAIN;
		return;
	}

	return ($self);
}

#
# Verify that topology is the same as at the time snapshot was created.
#
# The kstats are refreshed and the generation of every PG kstat that has a
# matching PG in the snapshot is compared with the generation stored in the
# snapshot. Returns 1 if all of them match and a false value otherwise (also
# when there is no snapshot, no PG tree, or a PG kstat can't be found).
#
sub _same_generation
{
	my $self = shift;

	#
	# Check the argument before it is dereferenced; done the other way
	# round the check could never fire.
	#
	return (0) unless $self;

	my $pgtree =  $self->{PGTREE} or return;

	my $ks = $self->{KSTAT};
	$ks->update();
	my $pg_info = $ks->{$self->{PG_MODULE}};
	foreach my $id (keys %$pg_info) {
		# PGs that are not part of the snapshot are not compared
		my $pg = $pgtree->{$id} or next;

		my $pg_ks = _kstat_get_pg($pg_info, $id,
					  $self->{USE_OLD_KSTATS});
		return unless $pg_ks;
		return (0) unless $pg->{generation} == $pg_ks->{generation};
	}
	return (1);
}

#
# Update the per-sharing-relationship capacity estimate in both snapshots.
#
# Arguments: two snapshots. Any further arguments are ignored; all PGs present
# in both snapshots are examined.
#
# For every PG that is present in both snapshots with the same generation and
# the same sharing name, the candidate capacity is the largest of the two
# util_rate_max values and the utilization rate observed between the snapshots.
# The capacity stored for the sharing name is only ever raised, never lowered.
#
sub _capacity_update
{
	my $c1 = shift;
	my $c2 = shift;

	my $pgtree1 = $c1->{PGTREE};
	my $pgtree2 = $c2->{PGTREE};

	foreach my $pgid (keys %$pgtree1) {
		my $pg1 = $pgtree1->{$pgid};
		my $pg2 = $pgtree2->{$pgid};
		next unless $pg1 && $pg2;

		#
		# Counters can only be compared between PGs having the same
		# generation, as everywhere else in this module.
		#
		next unless $pg1->{generation} == $pg2->{generation};

		my $shname = $pg1->{sh_name};
		next unless $shname eq $pg2->{sh_name};

		my $max_rate = max($pg1->{util_rate_max}, $pg2->{util_rate_max});

		#
		# Utilization rate observed between the snapshots. Without any
		# elapsed time there is no rate to speak of.
		#
		my $tdelta = abs($pg1->{snaptime} - $pg2->{snaptime});
		my $utilization = $tdelta ?
		    abs($pg1->{util} - $pg2->{util}) / $tdelta : 0;

		$max_rate = $utilization if
		  $utilization && $max_rate < $utilization;

		#
		# Raise the stored capacity if it is missing or smaller than the
		# new estimate. The comparison is written without a leading
		# negation: "!X < Y" parses as "(!X) < Y".
		#
		foreach my $cookie ($c1, $c2) {
			my $known = $cookie->{CAPACITY}->{$shname};
			$cookie->{CAPACITY}->{$shname} = $max_rate if
			  !$known || $known < $max_rate;
		}
	}

	return;
}

#
# Return list of PGs depth first: every PG given is followed by the walk of
# its children before the next PG given is visited.
# Arguments: the snapshot object and the list of PGs to start from.
#
sub _walk_depth_first
{
	my ($snapshot, @pgs) = @_;

	# Nothing to do if list is empty
	return unless @pgs;

	my @visited;
	foreach my $pg (@pgs) {
		push(@visited, $pg,
		     _walk_depth_first($snapshot, $snapshot->children($pg)));
	}

	return (@visited);
}

#
# Return list of PGs breadth first: all PGs given come first, followed by the
# walk of all their children taken together.
# Arguments: the snapshot object and the list of PGs to start from.
#
sub _walk_breadth_first
{
	my ($snapshot, @this_level) = @_;

	# Nothing to do if list is empty
	return unless @this_level;

	my @next_level = map { $snapshot->children($_) } @this_level;

	return (@this_level, _walk_breadth_first($snapshot, @next_level));
}

#
# Given the kstat reference (already hashed by module name) and PG ID return the
# corresponding kstat.
#
# Arguments: module hash, PG ID, and a flag telling whether old kstat names are
# in use. With old kstats the entry named 'hardware' of the instance is
# returned. Otherwise the first entry of the instance whose 'pg_id' equals the
# PG ID is returned, or undef if there is none.
#
sub _kstat_get_pg
{
	my ($mod, $pgid, $use_old_kstats) = @_;

	if ($use_old_kstats) {
		return ($mod->{$pgid}->{hardware});
	}

	foreach my $instance (values(%{$mod->{$pgid}})) {
		return ($instance) if $instance->{pg_id} == $pgid;
	}

	return (undef);
}

######################################################################
# Set routines
#######################################################################
#
# Return T if every element of the first list is also present in the second
# list. An empty first list is a subset of anything.
# Both lists are passed by reference; anything else is a fatal error.
#
sub _is_subset
{
	my ($left, $right) = @_;

	Carp::croak("invalid left argument") unless ref ($left) eq 'ARRAY';
	Carp::croak("invalid right argument") unless ref ($right) eq 'ARRAY';

	# Index of everything found in @right
	my %in_right = map { ($_ => 1) } @$right;

	# The first member of @left missing from @right settles the answer
	foreach my $element (@$left) {
		return (!1) unless $in_right{$element};
	}

	return (!0);
}

#
# Return 1 if the element is present in the list passed by reference and
# undef otherwise.
#
sub _is_member
{
	my ($set, $element) = @_;

	my %in_set = map { ($_ => 1) } @$set;

	return ($in_set{$element});
}

#
# Return T if C1 and C2 contain the same elements.
# Both lists are passed by reference. Lists of different length are never
# equal; lists of the same length are equal if each is a subset of the other.
#
sub _set_equal
{
	my ($c1, $c2) = @_;

	return 0 if scalar @$c1 != scalar @$c2;

	my $forward = _is_subset($c1, $c2);
	return ($forward) unless $forward;

	return (_is_subset($c2, $c1));
}

#
# Return the intersection of two lists passed by reference.
# The shorter list is turned into a lookup hash and the longer one is filtered
# against it, so the result follows the order (and keeps the duplicates) of the
# longer list. When both lists have the same length, the second one is the one
# being filtered.
#
sub _set_intersect
{
	my ($left, $right) = @_;

	# Make sure $left refers to the list that is not longer
	($left, $right) = ($right, $left) if scalar @$left > scalar @$right;

	# Index of everything found in @left
	my %in_left = map { ($_ => 1) } @$left;

	# Keep members of @right that are present in @left
	return (grep { $in_left{$_} } @$right);
}

#
# Expand start-end into the list of values
# Input: string containing numeric IDs or x-y ranges separated by commas;
#   a leading object reference (method-style call) is skipped
# Output: single value or a list of values; nothing for undefined or empty input
# Ranges with start being more than end are inverted
# Anything that is neither a number nor a range is returned unchanged
#
sub _expand
{
	#
	# Skip the first argument if it is the object reference. Any reference
	# is skipped: a blessed object reports its class name, not 'HASH'.
	#
	shift if ref($_[0]);

	my $arg = shift;

	return unless defined $arg;

	my @args = split /,/, $arg;

	# Several comma-separated items: expand each of them in turn
	return map { _expand($_) } @args if scalar @args > 1;

	# An empty string splits into nothing at all
	$arg = shift @args;
	return unless defined $arg;

	if ($arg =~ m/^\d+$/) {
		# single number
		return ($arg);
	} elsif ($arg =~ m/^(\d+)\-(\d+)$/) {
		my ($start, $end) = ($1, $2);	# $start-$end
		# Reverse the interval if start > end
		($start, $end) = ($end, $start) if $start > $end;
		return ($start .. $end);
	}

	# Not a number and not a range; hand it back as is
	return $arg;
}

#
# Consolidate consecutive ids as start-end
# Input: list of ids
# Output: string with space-separated cpu values with ranges
#   collapsed as x-y. A run of three or more consecutive IDs is printed as
#   x-y, a run of two IDs is printed as two separate values. Non-numeric
#   values are printed as is and never become a part of a range.
#   Returns an empty string for an empty list.
#
sub _collapse
{
	return ('') unless @_;

	my @args = _uniqsort(@_);
	my @out;		# Output tokens, joined by spaces at the end
	my ($start, $end);	# Pending run of consecutive numeric IDs

	# Append the pending run (if there is one) to @out and forget it
	my $flush = sub {
		return unless defined $start;
		if ($end > $start + 1) {	# range
			push @out, "$start-$end";
		} elsif ($end > $start) {	# different values
			push @out, $start, $end;
		} else {			# same value
			push @out, $start;
		}
		($start, $end) = (undef, undef);
	};

	foreach my $el (@args) {
		#
		# Note that '!$el =~ /re/' would negate $el before matching
		# since '!' binds tighter than '=~', hence the use of '!~'.
		#
		if ($el !~ /^\d+$/) {
			# Not a number: close the pending run, print as is
			$flush->();
			push @out, $el;
		} elsif (defined $end && $el == $end + 1) {
			#
			# Got consecutive ID, so extend end of range without
			# printing anything since the range may extend further
			#
			$end = $el;
		} else {
			#
			# Next ID is not consecutive, so print IDs gotten so
			# far and try finding consecutive range starting from
			# this ID
			#
			$flush->();
			$start = $end = $el;
		}
	}

	# Print last ID(s)
	$flush->();

	my $result = join(' ', @out);
	# Remove any spaces in the beginning
	$result =~ s/^\s+//;
	return ($result);
}

#
# Map a sharing relationship name to its sorting weight from the
# %relationships_order table.
# Names which have no (or a false) entry in the table get weight 0.
#
sub _relationship_order
{
	my ($name) = @_;
	my $order = $relationships_order{$name};

	return ($order ? $order : 0);
}

#
# Get software load for each CPU from kstats
# Argument: kstat reference
# Returns: reference to the hash indexed by CPU ID. Each value is a hash
# with cpu_idle, cpu_user, cpu_sys keys holding the cpu_ticks_idle,
# cpu_ticks_user and cpu_ticks_kernel values of the 'sys' kstat of that CPU.
# Returns nothing when called without the kstat reference and undef when
# the 'cpu' kstat module has no entries.
#
sub _get_sw_cpu_load
{
	my ($ks) = @_;
	return unless $ks;

	my $loads;
	my $cpu_ks = $ks->{cpu};

	for my $cpu (keys %$cpu_ks) {
		my $sys = $cpu_ks->{$cpu}->{sys};
		$loads->{$cpu} = {
			cpu_idle => $sys->{cpu_ticks_idle},
			cpu_user => $sys->{cpu_ticks_user},
			cpu_sys => $sys->{cpu_ticks_kernel},
		};
	}

	return ($loads);
}

#
# Get hardware load for each CPU of each PG from kstats
# Argument: object reference. The following fields are used:
#  PGTREE		the PG tree, indexed by PG ID
#  KSTAT		the kstat reference
#  PG_CPU_MODULE	name of the kstat module with per-CPU PG data
#
# Returns: nothing
# Stores CPU load in the $pg->{cpudata} hash for each PG. The hash is
# indexed by CPU ID and holds generation, util, util_time_running,
# util_time_stopped and snaptime for every CPU of the PG which has a kstat
# with a matching pg_id.
#
sub _get_hw_cpu_load
{
	my $self = shift;
	my $pgtree = $self->{PGTREE};
	my $ks = $self->{KSTAT};

	my $pg_cpu_ks = $ks->{$self->{PG_CPU_MODULE}};

	foreach my $pgid (keys %$pgtree) {
		my $pg = $pgtree->{$pgid};

		#
		# Build the set of CPUs in this PG once instead of doing it
		# for every CPU found in kstats. A PG without a CPU list
		# simply has no CPUs to collect data for.
		#
		my %in_pg = map { $_ => 1 } @{$pg->{cpus} || []};

		foreach my $cpu (keys %$pg_cpu_ks) {
			next unless $in_pg{$cpu};
			my $cpu_hw_data = $pg_cpu_ks->{$cpu};
			foreach my $hw (keys %$cpu_hw_data) {
				my $cpudata = $cpu_hw_data->{$hw};

				#
				# Only consider information for this PG
				#
				next unless $cpudata->{pg_id} == $pgid;

				$pg->{cpudata}->{$cpu}->{generation} =
				  $cpudata->{generation};
				$pg->{cpudata}->{$cpu}->{util} =
				  $cpudata->{hw_util};
				$pg->{cpudata}->{$cpu}->{util_time_running} =
				  $cpudata->{hw_util_time_running};
				$pg->{cpudata}->{$cpu}->{util_time_stopped} =
				  $cpudata->{hw_util_time_stopped};
				$pg->{cpudata}->{$cpu}->{snaptime} =
				  $cpudata->{snaptime};
			}
		}
	}

	return;
}

1;

__END__

#
# The information about PG hierarchy is contained in an object returned by the
# new() method.
#
# This module can deal with old PG kstats that have 'pg' and 'pg_cpu' as module
# names as well as new PG kstats which use 'pg_hw_perf' and 'pg_hw_perf_cpu' as
# module names.
#
# The object contains the following fields:
#
#   CPUS		List of all CPUs present.
#   CAPACITY		Estimate of capacity for each sharing
#   PGTREE		The PG tree. See below for the tree representation.
#
#   PG_MODULE 		Module name for the PG kstats. It is either 'pg' for
#			 old style kstats, or 'pg_hw_perf' for new style kstats.
#
#   MAX_FREQUENCY	Maximum CPU frequency
#   USE_OLD_KSTATS	True if we are dealing with old style kstats
#   KSTAT		The kstat object used to generate this hierarchy.
#
# The PG tree is represented as a hash table indexed by PG ID. Each element of
# the table is the hash reference with the following fields:
#
#   children		Reference to the list of children PG IDs
#   cpus		Reference to the list of cpu IDs in the PG
#   current_rate	Current utilization rate
#   generation		PG generation
#   id			PG id
#   ncpus		number of CPUs in the PG
#   parent		PG parent id, or -1 if there is none.
#   sh_name		Sharing name
#   snaptime		Snapshot time
#   util		Hardware utilization
#   util_rate_max	Maximum utilization rate
#   util_time_running	Time (in nanoseconds) when utilization data is collected
#   util_time_stopped	Time when utilization data is not collected
#
# The fields (with the exception of 'children') are a copy of the data from
# kstats.
#
# The PG hierarchy in the kernel does not have the root PG. We simulate the root
# (System) PG which is the parent of top level PGs in the system. This PG always
# has ID 0.
#