/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Support for determining capacity and utilization of performance relevant
 * hardware components in a computer
 *
 * THEORY
 * ------
 * The capacity and utilization of the performance relevant hardware components
 * are needed to be able to optimize performance while minimizing the amount of
 * power used on a system.  The idea is to use hardware performance counters
 * and potentially other means to determine the capacity and utilization of
 * performance relevant hardware components (e.g. execution pipeline, cache,
 * memory, etc.) and attribute the utilization to the responsible CPU and the
 * thread running there.
 *
 * This will help characterize the utilization of performance relevant
 * components and how much is used by each CPU and each thread.  With that
 * data, the utilization can be aggregated across all the CPUs sharing each
 * performance relevant hardware component to calculate the total utilization
 * of each component and compare that with the component's capacity to
 * essentially determine the actual hardware load of the component.  The
 * hardware utilization attributed to each running thread can also be
 * aggregated to determine the total hardware utilization of each component by
 * a workload.
 *
 * Once that is done, one can determine how much of each performance relevant
 * hardware component is needed by a given thread or set of threads (e.g. a
 * workload) and size up exactly what hardware is needed by the threads and how
 * much.  With this info, we can better place threads among CPUs to match their
 * exact hardware resource needs and potentially lower or raise the power based
 * on their utilization or pack threads onto the fewest hardware components
 * needed and power off any remaining unused components to minimize power
 * without sacrificing performance.
 *
 * IMPLEMENTATION
 * --------------
 * The code has been designed and implemented to make (un)programming and
 * reading the counters for a given CPU as lightweight and fast as possible.
 * This is very important because we need to read and potentially (un)program
 * the counters very often and in performance sensitive code.  Specifically,
 * the counters may need to be (un)programmed during context switch and/or a
 * cyclic handler when there are more counter events to count than existing
 * counters.
 *
 * Consequently, the code has been split up to allow allocating and
 * initializing everything needed to program and read the counters on a given
 * CPU once and make (un)programming and reading the counters for a given CPU
 * not have to allocate/free memory or grab any locks.
 * To do this, all the state needed to (un)program and read the counters on a
 * CPU is kept per CPU and is made lock free by forcing any code that reads or
 * manipulates the counters or the state needed to (un)program or read the
 * counters to run on the target CPU and disable preemption while running on
 * the target CPU to protect any critical sections.  All counter manipulation
 * on the target CPU is happening either from a cross-call to the target CPU
 * or at the same PIL as used by the cross-call subsystem.  This guarantees
 * that counter manipulation is not interrupted by cross-calls from other
 * CPUs.
 *
 * The synchronization has been made lock free or as simple as possible for
 * performance and to avoid getting the locking all tangled up when we
 * interpose on the CPC routines that (un)program the counters to manage the
 * counters between the kernel and user on each CPU.  When the user starts
 * using the counters on a given CPU, the kernel will unprogram the counters
 * that it is using on that CPU just before they are programmed for the user.
 * Then the kernel will program the counters on a given CPU for its own use
 * when the user stops using them.
 *
 * There is a special interaction with the DTrace cpc provider (dcpc).  Before
 * dcpc enables any probe, it requests to disable and unprogram all counters
 * used for capacity and utilization.  These counters are not re-programmed
 * until dcpc completes.  When all DTrace cpc probes are removed, dcpc
 * notifies the CU framework and it re-programs the counters.
 *
 * When a CPU is going offline, its CU counters are unprogrammed and disabled,
 * so that they would not be re-programmed again by some other activity on the
 * CPU that is going offline.
 *
 * The counters are programmed during boot.  However, a flag is available to
 * disable this if necessary (see cu_flags below).  A handler is provided to
 * (un)program the counters during CPU on/offline.  Basic routines are
 * provided to initialize and tear down this module, initialize and tear down
 * any state needed for a given CPU, and (un)program the counters for a given
 * CPU.  Lastly, a handler is provided to read the counters and attribute the
 * utilization to the responsible CPU.
 */
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/ddi.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/pghw.h>
#include <sys/cmt.h>
#include <sys/policy.h>
#include <sys/x_call.h>
#include <sys/cap_util.h>

#include <sys/archsystm.h>
#include <sys/promif.h>

#if defined(__x86)
#include <sys/xc_levels.h>
#endif


/*
 * Default CPU hardware performance counter flags to use for measuring
 * capacity and utilization
 */
#define CU_CPC_FLAGS_DEFAULT \
        (CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)

/*
 * Possible Flags for controlling this module.
 */
#define CU_FLAG_ENABLE          1       /* Enable module */
#define CU_FLAG_READY           2       /* Ready to setup module */
#define CU_FLAG_ON              4       /* Module is on */

/*
 * pg_cpu kstats calculate utilization rate and maximum utilization rate for
 * some CPUs.  The rate is calculated based on data from two subsequent
 * snapshots.  When the time between such two snapshots is too small, the
 * resulting rate may have low accuracy, so we only consider snapshots which
 * are separated by at least CU_SAMPLE_INTERVAL_MIN nanoseconds from one
 * another.  We do not update the rate if the interval is smaller than that.
 *
 * Use one tenth of a second as the minimum interval for utilization rate
 * calculation.
 *
 * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor
 *       in the CU_RATE() macro below to guarantee that we never divide by
 *       zero.
 *
 * Rate is the number of events per second.
 * The rate is the number of events divided by time and multiplied by the
 * number of nanoseconds in a second.  We do not want time to be too small
 * since it will cause large errors in division.
 *
 * We do not want to multiply two large numbers (the instruction count and
 * NANOSEC) either since it may cause integer overflow.  So we divide both the
 * numerator and the denominator by the same value.
 *
 * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
 *       above to guarantee that time divided by this value is always
 *       non-zero.
 */
#define CU_RATE(val, time) \
        (((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))

#define CU_SAMPLE_INTERVAL_MIN  (NANOSEC / 10)

#define CU_SCALE        (CU_SAMPLE_INTERVAL_MIN / 10000)
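
/*
 * To make the scaling concrete, here is a worked example (illustrative only;
 * the numbers follow directly from the definitions above).
 * CU_SAMPLE_INTERVAL_MIN is NANOSEC / 10 = 100,000,000ns and CU_SCALE is
 * 100,000,000 / 10,000 = 10,000.  For 5,000,000 events counted over that
 * minimum interval:
 *
 *      CU_RATE(5000000, 100000000)
 *          = (5000000 * (NANOSEC / 10000)) / (100000000 / 10000)
 *          = (5000000 * 100000) / 10000
 *          = 50,000,000 events per second
 *
 * which matches the unscaled computation of 5,000,000 events / 0.1s while
 * keeping both intermediate operands well below the 64-bit overflow limit.
 */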
/*
 * When the time between two kstat reads for the same CPU is less than
 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter
 * values for the CPU.  This helps reduce cross-calls when kstat consumers
 * read data very often or when they read PG utilization data and then CPU
 * utilization data quickly after that.
 */
#define CU_UPDATE_THRESHOLD     (NANOSEC / 10)

/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt
 */
#ifdef DEBUG
#define IS_HIPIL()      (getpil() >= XCALL_PIL)
#else
#define IS_HIPIL()
#endif  /* DEBUG */


typedef void (*cu_cpu_func_t)(uintptr_t, int *);


/*
 * Flags to use for programming CPU hardware performance counters to measure
 * capacity and utilization
 */
int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;

/*
 * Initial value used for programming hardware counters
 */
uint64_t cu_cpc_preset_value = 0;

/*
 * List of CPC event requests for capacity and utilization.
 */
static kcpc_request_list_t *cu_cpc_reqs = NULL;

/*
 * When a CPU is a member of PG with a sharing relationship that is supported
 * by the capacity/utilization framework, a kstat is created for that CPU and
 * sharing relationship.
 *
 * These kstats are updated one at a time, so we can have a single scratch
 * space to fill the data.
 *
 * CPU counter kstats fields:
 *
 *   cu_cpu_id          CPU ID for this kstat
 *
 *   cu_pg_id           PG ID for the sharing relationship described by this
 *                      kstat
 *
 *   cu_generation      Generation value that increases whenever any CPU goes
 *                      offline or online.  Two kstat snapshots for the same
 *                      CPU may only be compared if they have the same
 *                      generation.
 *
 *   cu_cpu_util        Running value of CPU utilization for the sharing
 *                      relationship
 *
 *   cu_cpu_time_running Total time spent collecting CU data.  The time may be
 *                      less than wall time if CU counters were stopped for
 *                      some time.
 *
 *   cu_cpu_time_stopped Total time the CU counters were stopped.
 *
 *   cu_cpu_rate        Utilization rate, expressed in operations per second.
 *
 *   cu_cpu_rate_max    Maximum observed value of utilization rate.
 *
 *   cu_cpu_relationship Name of sharing relationship for the PG in this kstat
 */
struct cu_cpu_kstat {
        kstat_named_t   cu_cpu_id;
        kstat_named_t   cu_pg_id;
        kstat_named_t   cu_generation;
        kstat_named_t   cu_cpu_util;
        kstat_named_t   cu_cpu_time_running;
        kstat_named_t   cu_cpu_time_stopped;
        kstat_named_t   cu_cpu_rate;
        kstat_named_t   cu_cpu_rate_max;
        kstat_named_t   cu_cpu_relationship;
} cu_cpu_kstat = {
        { "cpu_id",                     KSTAT_DATA_UINT32 },
        { "pg_id",                      KSTAT_DATA_INT32 },
        { "generation",                 KSTAT_DATA_UINT32 },
        { "hw_util",                    KSTAT_DATA_UINT64 },
        { "hw_util_time_running",       KSTAT_DATA_UINT64 },
        { "hw_util_time_stopped",       KSTAT_DATA_UINT64 },
        { "hw_util_rate",               KSTAT_DATA_UINT64 },
        { "hw_util_rate_max",           KSTAT_DATA_UINT64 },
        { "relationship",               KSTAT_DATA_STRING },
};
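
/*
 * For reference, these kstats can be read from userland with libkstat(3LIB).
 * The following is a minimal illustrative sketch (not part of this file and
 * not compiled here).  It assumes CPU 0 belongs to a PG whose canonified
 * sharing name is "Integer_Pipeline"; the actual name is platform dependent:
 *
 *      #include <kstat.h>
 *      #include <stdio.h>
 *
 *      int
 *      main(void)
 *      {
 *              kstat_ctl_t *kc = kstat_open();
 *              kstat_t *ksp;
 *              kstat_named_t *kn;
 *
 *              if (kc == NULL)
 *                      return (1);
 *              ksp = kstat_lookup(kc, "pg_hw_perf_cpu", 0,
 *                  "Integer_Pipeline");
 *              if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *                      kn = kstat_data_lookup(ksp, "hw_util_rate");
 *                      if (kn != NULL)
 *                              (void) printf("rate: %llu ops/sec\n",
 *                                  (u_longlong_t)kn->value.ui64);
 *              }
 *              (void) kstat_close(kc);
 *              return (0);
 *      }
 *
 * Note that a caller without the priv_cpc_cpu privilege sees zeroes for the
 * utilization fields (see cu_cpu_kstat_update() below).
 */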
/*
 * Flags for controlling this module
 */
uint_t cu_flags = CU_FLAG_ENABLE;

/*
 * Error return value for cu_init() since it can't return anything because it
 * is called from mp_init_tbl[] (:-(
 */
static int cu_init_error = 0;

hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;

hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD;

static kmutex_t pg_cpu_kstat_lock;


/*
 * Forward declaration of interface routines
 */
void            cu_disable(void);
void            cu_enable(void);
void            cu_init(void);
void            cu_cpc_program(cpu_t *cp, int *err);
void            cu_cpc_unprogram(cpu_t *cp, int *err);
int             cu_cpu_update(struct cpu *cp, boolean_t move_to);
void            cu_pg_update(pghw_t *pg);


/*
 * Forward declaration of private routines
 */
static int      cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void     cu_cpc_program_xcall(uintptr_t arg, int *err);
static int      cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
    int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
static int      cu_cpu_callback(cpu_setup_t what, int id, void *arg);
static void     cu_cpu_disable(cpu_t *cp);
static void     cu_cpu_enable(cpu_t *cp);
static int      cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int      cu_cpu_fini(cpu_t *cp);
static void     cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int      cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int      cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int      cu_cpu_update_stats(cu_cntr_stats_t *stats,
    uint64_t cntr_value);
static void     cu_cpu_info_detach_xcall(void);

/*
 * Disable or enable Capacity Utilization counters on all CPUs.
 */
void
cu_disable(void)
{
        cpu_t *cp;

        ASSERT(MUTEX_HELD(&cpu_lock));

        cp = cpu_active;
        do {
                if (!(cp->cpu_flags & CPU_OFFLINE))
                        cu_cpu_disable(cp);
        } while ((cp = cp->cpu_next_onln) != cpu_active);
}


void
cu_enable(void)
{
        cpu_t *cp;

        ASSERT(MUTEX_HELD(&cpu_lock));

        cp = cpu_active;
        do {
                if (!(cp->cpu_flags & CPU_OFFLINE))
                        cu_cpu_enable(cp);
        } while ((cp = cp->cpu_next_onln) != cpu_active);
}


/*
 * Setup capacity and utilization support
 */
void
cu_init(void)
{
        cpu_t *cp;

        cu_init_error = 0;
        if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
                cu_init_error = -1;
                return;
        }

        if (kcpc_init() != 0) {
                cu_init_error = -2;
                return;
        }

        /*
         * Can't measure hardware capacity and utilization without CPU
         * hardware performance counters
         */
        if (cpc_ncounters <= 0) {
                cu_init_error = -3;
                return;
        }

        /*
         * Setup CPC event request queue
         */
        cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);

        mutex_enter(&cpu_lock);

        /*
         * Mark flags to say that module is ready to be setup
         */
        cu_flags |= CU_FLAG_READY;

        cp = cpu_active;
        do {
                /*
                 * Allocate and setup state needed to measure capacity and
                 * utilization
                 */
                if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
                        cu_init_error = -5;

                /*
                 * Reset list of counter event requests so its space can be
                 * reused for a different set of requests for next CPU
                 */
                (void) kcpc_reqs_reset(cu_cpc_reqs);

                cp = cp->cpu_next_onln;
        } while (cp != cpu_active);

        /*
         * Mark flags to say that module is on now and counters are ready to
         * be programmed on all active CPUs
         */
        cu_flags |= CU_FLAG_ON;

        /*
         * Program counters on currently active CPUs
         */
        cp = cpu_active;
        do {
                if (cu_cpu_run(cp, cu_cpc_program_xcall,
                    (uintptr_t)B_FALSE) != 0)
                        cu_init_error = -6;

                cp = cp->cpu_next_onln;
        } while (cp != cpu_active);

        /*
         * Register callback for CPU state changes to enable and disable
         * CPC counters as CPUs come on and offline
         */
        register_cpu_setup_func(cu_cpu_callback, NULL);

        mutex_exit(&cpu_lock);
}
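
/*
 * The boot-time programming above can be suppressed by clearing
 * CU_FLAG_ENABLE from cu_flags.  As an illustrative sketch (assuming the
 * standard /etc/system mechanism for patching kernel variables at boot), one
 * would add the following line and reboot:
 *
 *      set cu_flags = 0
 *
 * cu_init() then returns early with cu_init_error set to -1 and no CU
 * counters are ever programmed.
 */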
/*
 * Return number of counter events needed to measure capacity and utilization
 * for specified CPU and fill in list of CPC requests with each counter event
 * needed if a list to add CPC requests to is given
 *
 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
 *       everything that has been successfully allocated if any memory
 *       allocation fails
 */
static int
cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
{
        group_t         *cmt_pgs;
        cu_cntr_info_t  **cntr_info_array;
        cpu_pg_t        *cpu_pgs;
        cu_cpu_info_t   *cu_cpu_info;
        pg_cmt_t        *pg_cmt;
        pghw_t          *pg_hw;
        cu_cntr_stats_t *stats;
        int             nevents;
        pghw_type_t     pg_hw_type;
        group_iter_t    iter;

        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * There has to be a target CPU for this
         */
        if (cp == NULL)
                return (-1);

        /*
         * Return 0 when CPU doesn't belong to any group
         */
        cpu_pgs = cp->cpu_pg;
        if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
                return (0);

        cmt_pgs = &cpu_pgs->cmt_pgs;
        cu_cpu_info = cp->cpu_cu_info;

        /*
         * Grab counter statistics and info
         */
        if (reqs == NULL) {
                stats = NULL;
                cntr_info_array = NULL;
        } else {
                if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
                        return (-2);

                stats = cu_cpu_info->cu_cntr_stats;
                cntr_info_array = cu_cpu_info->cu_cntr_info;
        }

        /*
         * See whether platform (or processor) specific code knows which CPC
         * events to request, etc. are needed to measure hardware capacity and
         * utilization on this machine
         */
        nevents = cu_plat_cpc_init(cp, reqs, nreqs);
        if (nevents >= 0)
                return (nevents);

        /*
         * Let common code decide which CPC events to request, etc. to measure
         * capacity and utilization since platform (or processor) specific
         * code doesn't know....
         *
         * Walk CPU's PG lineage and do the following:
         *
         * - Setup CPC request, counter info, and stats needed for each
         *   counter event to measure capacity and utilization for each of
         *   CPU's PG hardware sharing relationships
         *
         * - Create PG CPU kstats to export capacity and utilization for each
         *   PG
         */
        nevents = 0;
        group_iter_init(&iter);
        while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
                cu_cntr_info_t  *cntr_info;
                int             nevents_save;
                int             nstats;

                pg_hw = (pghw_t *)pg_cmt;
                pg_hw_type = pg_hw->pghw_hw;
                nevents_save = nevents;
                nstats = 0;

                switch (pg_hw_type) {
                case PGHW_IPIPE:
                        if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
                            KM_NOSLEEP, &nevents) != 0)
                                continue;
                        nstats = 1;
                        break;

                case PGHW_FPU:
                        if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
                            KM_NOSLEEP, &nevents) != 0)
                                continue;
                        nstats = 1;
                        break;

                default:
                        /*
                         * Don't measure capacity and utilization for this
                         * kind of PG hardware relationship so skip to next
                         * PG in CPU's PG lineage
                         */
                        continue;
                }

                cntr_info = cntr_info_array[pg_hw_type];

                /*
                 * Nothing to measure for this hardware sharing relationship
                 */
                if (nevents - nevents_save == 0) {
                        if (cntr_info != NULL) {
                                kmem_free(cntr_info, sizeof (cu_cntr_info_t));
                                cntr_info_array[pg_hw_type] = NULL;
                        }
                        continue;
                }

                /*
                 * Fill in counter info for this PG hardware relationship
                 */
                if (cntr_info == NULL) {
                        cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
                            KM_NOSLEEP);
                        if (cntr_info == NULL)
                                continue;
                        cntr_info_array[pg_hw_type] = cntr_info;
                }
                cntr_info->ci_cpu = cp;
                cntr_info->ci_pg = pg_hw;
                cntr_info->ci_stats = &stats[nevents_save];
                cntr_info->ci_nstats = nstats;

                /*
                 * Create PG CPU kstats for this hardware relationship
                 */
                cu_cpu_kstat_create(pg_hw, cntr_info);
        }

        return (nevents);
}
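
/*
 * The switch in cu_cpc_init() above is the only place where common code
 * decides which sharing relationships get measured.  As an illustrative,
 * hypothetical sketch (not in this file): supporting another relationship,
 * say a shared cache, would add a case resembling
 *
 *      case PGHW_CACHE:
 *              if (cu_cpc_req_add("PAPI_l2_tcm", reqs, nreqs, stats,
 *                  KM_NOSLEEP, &nevents) != 0)
 *                      continue;
 *              nstats = 1;
 *              break;
 *
 * where the generic event name (here a PAPI-style L2 cache miss event, an
 * assumption) must be one the PCBE supports, since cu_cpc_req_add() rejects
 * any event for which kcpc_event_supported() returns B_FALSE.
 */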
/*
 * Program counters for capacity and utilization on given CPU
 *
 * If any of the following conditions is true, the counters are not programmed:
 *
 * - CU framework is disabled
 * - The cpu_cu_info field of the cpu structure is NULL
 * - DTrace is active
 * - Counters are programmed already
 * - Counters are disabled (by calls to cu_cpu_disable())
 */
void
cu_cpc_program(cpu_t *cp, int *err)
{
        cu_cpc_ctx_t    *cpu_ctx;
        kcpc_ctx_t      *ctx;
        cu_cpu_info_t   *cu_cpu_info;

        ASSERT(IS_HIPIL());

        /*
         * Should be running on given CPU.  We disable preemption to keep CPU
         * from disappearing and make sure flags and CPC context don't change
         * from underneath us
         */
        kpreempt_disable();
        ASSERT(cp == CPU);

        /*
         * Module not ready to program counters
         */
        if (!(cu_flags & CU_FLAG_ON)) {
                *err = -1;
                kpreempt_enable();
                return;
        }

        if (cp == NULL) {
                *err = -2;
                kpreempt_enable();
                return;
        }

        cu_cpu_info = cp->cpu_cu_info;
        if (cu_cpu_info == NULL) {
                *err = -3;
                kpreempt_enable();
                return;
        }

        /*
         * If DTrace CPC is active or counters turned on already or are
         * disabled, just return.
         */
        if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
            cu_cpu_info->cu_disabled) {
                *err = 1;
                kpreempt_enable();
                return;
        }

        if ((CPU->cpu_cpc_ctx != NULL) &&
            !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
                *err = -4;
                kpreempt_enable();
                return;
        }

        /*
         * Get CPU's CPC context needed for capacity and utilization
         */
        cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
        ASSERT(cpu_ctx != NULL);
        ASSERT(cpu_ctx->nctx >= 0);
        ASSERT(cpu_ctx->ctx_ptr_array == NULL ||
            cpu_ctx->ctx_ptr_array_sz > 0);
        ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
        if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
            cpu_ctx->ctx_ptr_array_sz <= 0) {
                *err = -5;
                kpreempt_enable();
                return;
        }

        /*
         * Increment index in CPU's CPC context info to point at next context
         * to program
         *
         * NOTE: Do this now instead of after programming counters to ensure
         *       that index will always point at *current* context so we will
         *       always be able to unprogram *current* context if necessary
         */
        cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;

        ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];

        /*
         * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
         * context before programming counters
         *
         * Context is marked with KCPC_CTX_INVALID_STOPPED when context is
         * unprogrammed and may be marked with KCPC_CTX_INVALID when
         * kcpc_invalidate_all() is called by cpustat(8) and dtrace CPC to
         * invalidate all CPC contexts before they take over all the counters.
         *
         * This isn't necessary since these flags are only used for thread
         * bound CPC contexts not CPU bound CPC contexts like ones used for
         * capacity and utilization.
         *
         * There is no need to protect the flag update since no one is using
         * this context now.
         */
        ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);

        /*
         * Program counters on this CPU
         */
        kcpc_program(ctx, B_FALSE, B_FALSE);

        cp->cpu_cpc_ctx = ctx;

        /*
         * Set state in CPU structure to say that CPU's counters are
         * programmed for capacity and utilization now and that they are
         * transitioning from off to on state.  This will cause cu_cpu_update
         * to update stop times for all programmed counters.
         */
        cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;

        /*
         * Update counter statistics
         */
        (void) cu_cpu_update(cp, B_FALSE);

        cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;

        *err = 0;
        kpreempt_enable();
}
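
/*
 * To illustrate the index rotation above (a worked example that follows
 * directly from the code, with hypothetical values): with nctx = 2 contexts,
 * cur_index starts at 0 after cu_cpu_init().  The first cu_cpc_program() call
 * advances cur_index to (0 + 1) % 2 = 1 and programs ctx_ptr_array[1]; the
 * next call advances it back to 0 and programs ctx_ptr_array[0].  Because the
 * index is advanced before kcpc_program(), cur_index always names the context
 * currently on the hardware, which is exactly the context that
 * cu_cpc_unprogram() looks up.
 */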
/*
 * Cross call wrapper routine for cu_cpc_program()
 *
 * Checks to make sure that counters on CPU aren't being used by someone else
 * before calling cu_cpc_program() since cu_cpc_program() needs to assert that
 * nobody else is using the counters to catch and prevent any broken code.
 * Also, this check needs to happen on the target CPU since the CPU's CPC
 * context can only be changed while running on the CPU.
 *
 * If the first argument is TRUE, cu_cpc_program_xcall also checks that there
 * is no valid thread bound cpc context.  This is important to check to
 * prevent re-programming thread counters with CU counters when CPU is coming
 * on-line.
 */
static void
cu_cpc_program_xcall(uintptr_t arg, int *err)
{
        boolean_t avoid_thread_context = (boolean_t)arg;

        kpreempt_disable();

        if (CPU->cpu_cpc_ctx != NULL &&
            !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
                *err = -100;
                kpreempt_enable();
                return;
        }

        if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) &&
            !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
                *err = -200;
                kpreempt_enable();
                return;
        }

        cu_cpc_program(CPU, err);
        kpreempt_enable();
}


/*
 * Unprogram counters for capacity and utilization on given CPU
 * This function should always be executed on the target CPU at high PIL
 */
void
cu_cpc_unprogram(cpu_t *cp, int *err)
{
        cu_cpc_ctx_t    *cpu_ctx;
        kcpc_ctx_t      *ctx;
        cu_cpu_info_t   *cu_cpu_info;

        ASSERT(IS_HIPIL());

        /*
         * Should be running on given CPU with preemption disabled to keep CPU
         * from disappearing and make sure flags and CPC context don't change
         * from underneath us
         */
        kpreempt_disable();
        ASSERT(cp == CPU);

        /*
         * Module not on
         */
        if (!(cu_flags & CU_FLAG_ON)) {
                *err = -1;
                kpreempt_enable();
                return;
        }

        cu_cpu_info = cp->cpu_cu_info;
        if (cu_cpu_info == NULL) {
                *err = -3;
                kpreempt_enable();
                return;
        }

        /*
         * Counters turned off already
         */
        if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) {
                *err = 1;
                kpreempt_enable();
                return;
        }

        /*
         * Update counter statistics
         */
        (void) cu_cpu_update(cp, B_FALSE);

        /*
         * Get CPU's CPC context needed for capacity and utilization
         */
        cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
        if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
            cpu_ctx->ctx_ptr_array_sz <= 0) {
                *err = -5;
                kpreempt_enable();
                return;
        }
        ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];

        /*
         * CPU's CPC context should be current capacity and utilization CPC
         * context
         */
        ASSERT(cp->cpu_cpc_ctx == ctx);
        if (cp->cpu_cpc_ctx != ctx) {
                *err = -6;
                kpreempt_enable();
                return;
        }

        /*
         * Unprogram counters on CPU.
         */
        kcpc_unprogram(ctx, B_FALSE);

        ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);

        /*
         * Unset state in CPU structure saying that CPU's counters are
         * programmed
         */
        cp->cpu_cpc_ctx = NULL;
        cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON;

        *err = 0;
        kpreempt_enable();
}
/*
 * Add given counter event to list of CPC requests
 */
static int
cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
    cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
{
        int     n;
        int     retval;
        uint_t  flags;

        /*
         * Return error when no counter event specified, counter event not
         * supported by CPC's PCBE, or number of events not given
         */
        if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
            nevents == NULL)
                return (-1);

        n = *nevents;

        /*
         * Only count number of counter events needed if list
         * where to add CPC requests not given
         */
        if (reqs == NULL) {
                n++;
                *nevents = n;
                return (-3);
        }

        /*
         * Return error when stats not given or not enough room on list of CPC
         * requests for more counter events
         */
        if (stats == NULL || (nreqs <= 0 && n >= nreqs))
                return (-4);

        /*
         * Use flags in cu_cpc_flags to program counters and enable overflow
         * interrupts/traps (unless PCBE can't handle overflow interrupts) so
         * PCBE can catch counters before they wrap to hopefully give us an
         * accurate (64-bit) virtualized counter
         */
        flags = cu_cpc_flags;
        if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
                flags &= ~CPC_OVF_NOTIFY_EMT;

        /*
         * Add CPC request to list
         */
        retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
            flags, 0, NULL, &stats[n], kmem_flags);
        if (retval != 0)
                return (-5);

        n++;
        *nevents = n;
        return (0);
}

static void
cu_cpu_info_detach_xcall(void)
{
        ASSERT(IS_HIPIL());

        CPU->cpu_cu_info = NULL;
}


/*
 * Enable or disable collection of capacity/utilization data for the current
 * CPU.  Counters are enabled if the 'on' argument is True and disabled if it
 * is False.  This function should always be executed at high PIL.
 */
static void
cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
{
        cpu_t           *cp = (cpu_t *)arg1;
        boolean_t       on = (boolean_t)arg2;
        int             error;
        cu_cpu_info_t   *cu_cpu_info;

        ASSERT(IS_HIPIL());
        kpreempt_disable();
        ASSERT(cp == CPU);

        if (!(cu_flags & CU_FLAG_ON)) {
                kpreempt_enable();
                return;
        }

        cu_cpu_info = cp->cpu_cu_info;
        if (cu_cpu_info == NULL) {
                kpreempt_enable();
                return;
        }

        ASSERT(!cu_cpu_info->cu_disabled ||
            !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));

        if (on) {
                /*
                 * Decrement the cu_disabled counter.
                 * Once it drops to zero, call cu_cpc_program.
                 */
                if (cu_cpu_info->cu_disabled > 0)
                        cu_cpu_info->cu_disabled--;
                if (cu_cpu_info->cu_disabled == 0)
                        cu_cpc_program(CPU, &error);
        } else if (cu_cpu_info->cu_disabled++ == 0) {
                /*
                 * This is the first attempt to disable CU, so turn it off
                 */
                cu_cpc_unprogram(cp, &error);
                ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
        }

        kpreempt_enable();
}
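
/*
 * cu_disabled thus acts as a disable reference count: disable requests nest,
 * and only the matching final enable re-programs the counters.  A worked
 * sequence (illustrative, with hypothetical callers):
 *
 *      cu_cpu_disable(cp);     cu_disabled 0 -> 1, counters unprogrammed
 *      cu_cpu_disable(cp);     cu_disabled 1 -> 2, no-op
 *      cu_cpu_enable(cp);      cu_disabled 2 -> 1, counters stay off
 *      cu_cpu_enable(cp);      cu_disabled 1 -> 0, cu_cpc_program() runs
 *
 * This is also why cu_cpu_init() seeds cu_disabled to 1 when
 * dtrace_cpc_in_use is set: the counters stay off until dcpc drops its
 * outstanding disable.
 */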
/*
 * Callback for changes in CPU states
 * Used to enable or disable hardware performance counters on CPUs that are
 * turned on or off
 *
 * NOTE: cpc should be programmed/unprogrammed while running on the target
 * CPU.  We have to use thread_affinity_set to hop to the right CPU because
 * these routines expect cpu_lock held, so we can't cross-call other CPUs
 * while holding CPU lock.
 */
static int
/* LINTED E_FUNC_ARG_UNUSED */
cu_cpu_callback(cpu_setup_t what, int id, void *arg)
{
        cpu_t   *cp;
        int     retval = 0;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (!(cu_flags & CU_FLAG_ON))
                return (-1);

        cp = cpu_get(id);
        if (cp == NULL)
                return (-2);

        switch (what) {
        case CPU_ON:
                /*
                 * Setup counters on CPU being turned on
                 */
                retval = cu_cpu_init(cp, cu_cpc_reqs);

                /*
                 * Reset list of counter event requests so its space can be
                 * reused for a different set of requests for next CPU
                 */
                (void) kcpc_reqs_reset(cu_cpc_reqs);
                break;
        case CPU_INTR_ON:
                /*
                 * Setup counters on CPU being turned on.
                 */
                retval = cu_cpu_run(cp, cu_cpc_program_xcall,
                    (uintptr_t)B_TRUE);
                break;
        case CPU_OFF:
                /*
                 * Disable counters on CPU being turned off.  Counters will
                 * not be re-enabled on this CPU until it comes back online.
                 */
                cu_cpu_disable(cp);
                ASSERT(!CU_CPC_ON(cp));
                retval = cu_cpu_fini(cp);
                break;
        default:
                break;
        }

        return (retval);
}

/*
 * Disable or enable Capacity Utilization counters on a given CPU.  This
 * function can be called from any CPU to disable counters on the given CPU.
 */
static void
cu_cpu_disable(cpu_t *cp)
{
        cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
}


static void
cu_cpu_enable(cpu_t *cp)
{
        cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
}


/*
 * Setup capacity and utilization support for given CPU
 *
 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
 *       everything that has been successfully allocated including cpu_cu_info
 *       if any memory allocation fails
 */
static int
cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
{
        kcpc_ctx_t      **ctx_ptr_array;
        size_t          ctx_ptr_array_sz;
        cu_cpc_ctx_t    *cpu_ctx;
        cu_cpu_info_t   *cu_cpu_info;
        int             n;

        /*
         * cpu_lock should be held and protect against CPU going away and
         * races with cu_{init,fini,cpu_fini}()
         */
        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * Return if not ready to setup counters yet
         */
        if (!(cu_flags & CU_FLAG_READY))
                return (-1);

        if (cp->cpu_cu_info == NULL) {
                cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
                    KM_NOSLEEP);
                if (cp->cpu_cu_info == NULL)
                        return (-2);
        }

        /*
         * Get capacity and utilization CPC context for CPU and check to see
         * whether it has been setup already
         */
        cu_cpu_info = cp->cpu_cu_info;
        cu_cpu_info->cu_cpu = cp;
        cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;

        cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
        if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
            cpu_ctx->ctx_ptr_array_sz > 0) {
                return (1);
        }

        /*
         * Should have no contexts since it hasn't been setup already
         */
        ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
            cpu_ctx->ctx_ptr_array_sz == 0);

        /*
         * Determine how many CPC events needed to measure capacity and
         * utilization for this CPU, allocate space for counter statistics for
         * each event, and fill in list of CPC event requests with
         * corresponding counter stats for each request to make attributing
         * counter data easier later....
         */
        n = cu_cpc_init(cp, NULL, 0);
        if (n <= 0) {
                (void) cu_cpu_fini(cp);
                return (-3);
        }

        cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
            KM_NOSLEEP);
        if (cu_cpu_info->cu_cntr_stats == NULL) {
                (void) cu_cpu_fini(cp);
                return (-4);
        }

        cu_cpu_info->cu_ncntr_stats = n;

        n = cu_cpc_init(cp, reqs, n);
        if (n <= 0) {
                (void) cu_cpu_fini(cp);
                return (-5);
        }

        /*
         * Create CPC context with given requests
         */
        ctx_ptr_array = NULL;
        ctx_ptr_array_sz = 0;
        n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
            &ctx_ptr_array_sz);
        if (n <= 0) {
                (void) cu_cpu_fini(cp);
                return (-6);
        }

        /*
         * Should have contexts
         */
        ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
        if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
                (void) cu_cpu_fini(cp);
                return (-7);
        }

        /*
         * Fill in CPC context info for CPU needed for capacity and
         * utilization
         */
        cpu_ctx->cur_index = 0;
        cpu_ctx->nctx = n;
        cpu_ctx->ctx_ptr_array = ctx_ptr_array;
        cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
        return (0);
}

/*
 * Tear down capacity and utilization support for given CPU
 */
static int
cu_cpu_fini(cpu_t *cp)
{
        kcpc_ctx_t      *ctx;
        cu_cpc_ctx_t    *cpu_ctx;
        cu_cpu_info_t   *cu_cpu_info;
        int             i;
        pghw_type_t     pg_hw_type;

        /*
         * cpu_lock should be held and protect against CPU going away and
         * races with cu_{init,fini,cpu_init}()
         */
        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * Have to at least be ready to setup counters to have allocated
         * anything that needs to be deallocated now
         */
        if (!(cu_flags & CU_FLAG_READY))
                return (-1);

        /*
         * Nothing to do if CPU's capacity and utilization info doesn't exist
         */
        cu_cpu_info = cp->cpu_cu_info;
        if (cu_cpu_info == NULL)
                return (1);

        /*
         * Tear down any existing kstats and counter info for each hardware
         * sharing relationship
         */
        for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS;
            pg_hw_type++) {
                cu_cntr_info_t  *cntr_info;

                cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type];
                if (cntr_info == NULL)
                        continue;

                if (cntr_info->ci_kstat != NULL) {
                        kstat_delete(cntr_info->ci_kstat);
                        cntr_info->ci_kstat = NULL;
                }
                kmem_free(cntr_info, sizeof (cu_cntr_info_t));
        }

        /*
         * Free counter statistics for CPU
         */
        ASSERT(cu_cpu_info->cu_cntr_stats == NULL ||
            cu_cpu_info->cu_ncntr_stats > 0);
        if (cu_cpu_info->cu_cntr_stats != NULL &&
            cu_cpu_info->cu_ncntr_stats > 0) {
                kmem_free(cu_cpu_info->cu_cntr_stats,
                    cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t));
                cu_cpu_info->cu_cntr_stats = NULL;
                cu_cpu_info->cu_ncntr_stats = 0;
        }

        /*
         * Get capacity and utilization CPC contexts for given CPU and check
         * to see whether they have been freed already
         */
        cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
        if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL &&
            cpu_ctx->ctx_ptr_array_sz > 0) {
                /*
                 * Free CPC contexts for given CPU
                 */
                for (i = 0; i < cpu_ctx->nctx; i++) {
                        ctx = cpu_ctx->ctx_ptr_array[i];
                        if (ctx == NULL)
                                continue;
                        kcpc_free_cpu(ctx);
                }

                /*
                 * Free CPC context pointer array
                 */
                kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz);

                /*
                 * Zero CPC info for CPU
                 */
                bzero(cpu_ctx, sizeof (cu_cpc_ctx_t));
        }

        /*
         * Set cp->cpu_cu_info pointer to NULL.  Go through cross-call to
         * ensure that no one is going to access the cpu_cu_info which we are
         * going to free.
         */
        if (cpu_is_online(cp))
                cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);
        else
                cp->cpu_cu_info = NULL;

        /*
         * Free CPU's capacity and utilization info
         */
        kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t));

        return (0);
}

/*
 * Create capacity & utilization kstats for given PG CPU hardware sharing
 * relationship
 */
static void
cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
{
        kstat_t *ks;
        char    *sharing = pghw_type_string(pg->pghw_hw);
        char    name[KSTAT_STRLEN + 1];

        /*
         * Just return when no counter info or CPU
         */
        if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
                return;

        /*
         * Canonify PG name to conform to kstat name rules
         */
        (void) strncpy(name, pghw_type_string(pg->pghw_hw), KSTAT_STRLEN + 1);
        strident_canon(name, TASKQ_NAMELEN + 1);

        if ((ks = kstat_create_zone("pg_hw_perf_cpu",
            cntr_info->ci_cpu->cpu_id, name, "processor_group",
            KSTAT_TYPE_NAMED, sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
                return;

        ks->ks_lock = &pg_cpu_kstat_lock;
        ks->ks_data = &cu_cpu_kstat;
        ks->ks_update = cu_cpu_kstat_update;
        ks->ks_data_size += strlen(sharing) + 1;
        ks->ks_private = cntr_info;
        cntr_info->ci_kstat = ks;
        kstat_install(cntr_info->ci_kstat);
}


/*
 * Propagate values from CPU capacity & utilization stats to kstats
 */
static int
cu_cpu_kstat_update(kstat_t *ksp, int rw)
{
        cpu_t           *cp;
        cu_cntr_info_t  *cntr_info = ksp->ks_private;
        struct cu_cpu_kstat     *kstat = &cu_cpu_kstat;
        pghw_t          *pg;
        cu_cntr_stats_t *stats;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        cp = cntr_info->ci_cpu;
        pg = cntr_info->ci_pg;
        kstat->cu_cpu_id.value.ui32 = cp->cpu_id;
        kstat->cu_pg_id.value.i32 = ((pg_t *)pg)->pg_id;

        /*
         * The caller should have priv_cpc_cpu privilege to get utilization
         * data.  Callers who do not have the privilege will see zeroes as the
         * values.
         */
        if (secpolicy_cpc_cpu(crgetcred()) != 0) {
                kstat->cu_generation.value.ui32 = cp->cpu_generation;
                kstat_named_setstr(&kstat->cu_cpu_relationship,
                    pghw_type_string(pg->pghw_hw));

                kstat->cu_cpu_util.value.ui64 = 0;
                kstat->cu_cpu_rate.value.ui64 = 0;
                kstat->cu_cpu_rate_max.value.ui64 = 0;
                kstat->cu_cpu_time_running.value.ui64 = 0;
                kstat->cu_cpu_time_stopped.value.ui64 = 0;

                return (0);
        }

        kpreempt_disable();

        /*
         * Update capacity and utilization statistics needed for CPU's PG
         * (CPU) kstats
         */
        (void) cu_cpu_update(cp, B_TRUE);

        stats = cntr_info->ci_stats;
        kstat->cu_generation.value.ui32 = cp->cpu_generation;
        kstat_named_setstr(&kstat->cu_cpu_relationship,
            pghw_type_string(pg->pghw_hw));

        kstat->cu_cpu_util.value.ui64 = stats->cs_value_total;
        kstat->cu_cpu_rate.value.ui64 = stats->cs_rate;
        kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max;
        kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running;
        kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped;

        /*
         * Counters are stopped now, so the cs_time_stopped was last
         * updated at cs_time_start time.  Add the time passed since then
         * to the stopped time.
         */
        if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
                kstat->cu_cpu_time_stopped.value.ui64 +=
                    gethrtime() - stats->cs_time_start;

        kpreempt_enable();

        return (0);
}

/*
 * Run specified function with specified argument on a given CPU and return
 * whatever the function returns
 */
static int
cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
{
        int error = 0;

        /*
         * cpu_call() will call func on the CPU specified with given argument
         * and return func's return value in last argument
         */
        cpu_call(cp, (cpu_call_func_t)(uintptr_t)func, arg, (uintptr_t)&error);
        return (error);
}
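
/*
 * A usage sketch for the helper above (hypothetical caller; both routines
 * shown are real routines from this file): to program the CU counters of an
 * arbitrary online CPU from thread context, one would run the cross-call
 * wrapper on that CPU and inspect the error it reports back:
 *
 *      int error;
 *
 *      mutex_enter(&cpu_lock);
 *      error = cu_cpu_run(cp, cu_cpc_program_xcall, (uintptr_t)B_FALSE);
 *      mutex_exit(&cpu_lock);
 *
 * cpu_call() arranges for cu_cpc_program_xcall() to execute on cp at
 * cross-call PIL, so the IS_HIPIL() assertion in cu_cpc_program() holds, and
 * the *err value stored there is handed back as cu_cpu_run()'s return value.
 * This is exactly how cu_init() programs the counters on each active CPU.
 */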
/*
 * Update counter statistics on a given CPU.
 *
 * If the move_to argument is True, execute the function on the CPU specified.
 * Otherwise, assume that it is already running on the right CPU.
 *
 * If move_to is specified, the caller should hold cpu_lock or have preemption
 * disabled.  Otherwise it is up to the caller to guarantee that things do not
 * change in the process.
 */
int
cu_cpu_update(struct cpu *cp, boolean_t move_to)
{
        int             retval;
        cu_cpu_info_t   *cu_cpu_info = cp->cpu_cu_info;
        hrtime_t        time_snap;

        ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);

        /*
         * Nothing to do if counters are not programmed
         */
        if (!(cu_flags & CU_FLAG_ON) ||
            (cu_cpu_info == NULL) ||
            !(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
                return (0);

        /*
         * Don't update the CPU statistics if they were updated recently,
         * and provide the old results instead
         */
        time_snap = gethrtime();
        if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
                DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
                return (0);
        }

        cu_cpu_info->cu_sample_time = time_snap;

        /*
         * CPC counter should be read on the CPU that is running the counter.
         * We either have to move ourselves to the target CPU or ensure that
         * we already run there.
         *
         * We use cross-call to the target CPU to execute kcpc_read() and
         * cu_cpu_update_stats() there.
         */
        retval = 0;
        if (move_to)
                (void) cu_cpu_run(cp, (cu_cpu_func_t)(uintptr_t)kcpc_read,
                    (uintptr_t)cu_cpu_update_stats);
        else {
                retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
                /*
                 * Offset negative return value by -10 so we can distinguish
                 * it from error return values of this routine vs kcpc_read()
                 */
                if (retval < 0)
                        retval -= 10;
        }

        return (retval);
}
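
/*
 * The time bookkeeping done by cu_cpu_update_stats() below can be illustrated
 * with a worked timeline (hypothetical values, in seconds of hrtime):
 *
 *      t=0     counters programmed; cs_time_start = 0
 *      t=2     cu_cpu_update() runs: counters were on the whole time, so
 *              time_delta = 2 is added to cs_time_running
 *      t=3     counters unprogrammed (e.g. dcpc takes over); the update done
 *              by cu_cpc_unprogram() adds 1 more to cs_time_running and
 *              leaves cs_time_start = 3
 *      t=5     counters re-programmed; CU_CPU_CNTRS_OFF_ON is set, so the
 *              update adds time_delta = 2 to cs_time_stopped instead, and
 *              cs_value_start is reset to the current raw counter value
 *
 * So cs_time_running + cs_time_stopped accounts for all wall time, and a
 * counter delta is never computed across an interval when the counters were
 * not counting.
 */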
/*
 * Update CPU counter statistics for current CPU.
 * This function may be called from a cross-call
 */
static int
cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
{
        cu_cpu_info_t   *cu_cpu_info = CPU->cpu_cu_info;
        uint_t          flags;
        uint64_t        delta;
        hrtime_t        time_delta;
        hrtime_t        time_snap;

        if (stats == NULL)
                return (-1);

        /*
         * Nothing to do if counters are not programmed.  This should not
         * happen, but we check just in case.
         */
        ASSERT(cu_flags & CU_FLAG_ON);
        ASSERT(cu_cpu_info != NULL);
        if (!(cu_flags & CU_FLAG_ON) ||
            (cu_cpu_info == NULL))
                return (-2);

        flags = cu_cpu_info->cu_flag;
        ASSERT(flags & CU_CPU_CNTRS_ON);
        if (!(flags & CU_CPU_CNTRS_ON))
                return (-2);

        /*
         * Take snapshot of high resolution timer
         */
        time_snap = gethrtime();

        /*
         * CU counters have just been programmed.  We cannot assume that the
         * new cntr_value continues from where we left off, so use the
         * cntr_value as the new initial value.
         */
        if (flags & CU_CPU_CNTRS_OFF_ON)
                stats->cs_value_start = cntr_value;

        /*
         * Calculate delta in counter values between start of sampling period
         * and now
         */
        delta = cntr_value - stats->cs_value_start;

        /*
         * Calculate time between start of sampling period and now
         */
        time_delta = stats->cs_time_start ?
            time_snap - stats->cs_time_start :
            0;
        stats->cs_time_start = time_snap;
        stats->cs_value_start = cntr_value;

        if (time_delta > 0) { /* wrap shouldn't happen */
                /*
                 * Update either running or stopped time based on the
                 * transition state
                 */
                if (flags & CU_CPU_CNTRS_OFF_ON)
                        stats->cs_time_stopped += time_delta;
                else
                        stats->cs_time_running += time_delta;
        }

        /*
         * Update rest of counter statistics if counter value didn't wrap
         */
        if (delta > 0) {
                /*
                 * Update utilization rate if the interval between samples is
                 * sufficient.
                 */
                ASSERT(cu_sample_interval_min > CU_SCALE);
                if (time_delta > cu_sample_interval_min)
                        stats->cs_rate = CU_RATE(delta, time_delta);
                if (stats->cs_rate_max < stats->cs_rate)
                        stats->cs_rate_max = stats->cs_rate;

                stats->cs_value_last = delta;
                stats->cs_value_total += delta;
        }

        return (0);
}

/*
 * Update CMT PG utilization data.
 *
 * This routine computes the running total utilization and times for the
 * specified PG by adding up the total utilization and counter running and
 * stopped times of all CPUs in the PG and calculates the utilization rate and
 * maximum rate for all CPUs in the PG.
 */
void
cu_pg_update(pghw_t *pg)
{
        pg_cpu_itr_t    cpu_iter;
        pghw_type_t     pg_hwtype;
        cpu_t           *cpu;
        pghw_util_t     *hw_util = &pg->pghw_stats;
        uint64_t        old_utilization = hw_util->pghw_util;
        hrtime_t        now;
        hrtime_t        time_delta;
        uint64_t        utilization_delta;

        ASSERT(MUTEX_HELD(&cpu_lock));

        now = gethrtime();

        pg_hwtype = pg->pghw_hw;

        /*
         * Initialize running total utilization and times for PG to 0
         */
        hw_util->pghw_util = 0;
        hw_util->pghw_time_running = 0;
        hw_util->pghw_time_stopped = 0;

        /*
         * Iterate over all CPUs in the PG and aggregate utilization, running
         * time and stopped time.
         */
        PG_CPU_ITR_INIT(pg, cpu_iter);
        while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
                cu_cpu_info_t   *cu_cpu_info = cpu->cpu_cu_info;
                cu_cntr_info_t  *cntr_info;
                cu_cntr_stats_t *stats;

                if (cu_cpu_info == NULL)
                        continue;

                /*
                 * Update utilization data for the CPU and then
                 * aggregate per CPU running totals for PG
                 */
                (void) cu_cpu_update(cpu, B_TRUE);
                cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];

                if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
                        continue;

                hw_util->pghw_util += stats->cs_value_total;
                hw_util->pghw_time_running += stats->cs_time_running;
                hw_util->pghw_time_stopped += stats->cs_time_stopped;

                /*
                 * If counters are stopped now, the pg_time_stopped was last
                 * updated at cs_time_start time.  Add the time passed since
                 * then to the stopped time.
                 */
                if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
                        hw_util->pghw_time_stopped +=
                            now - stats->cs_time_start;
        }

        /*
         * Compute per PG instruction rate and maximum rate
         */
        time_delta = now - hw_util->pghw_time_stamp;
        hw_util->pghw_time_stamp = now;

        if (old_utilization == 0)
                return;

        /*
         * Calculate change in utilization over sampling period and set this
         * to 0 if the delta would be 0 or negative which may happen if any
         * CPUs go offline during the sampling period
         */
        if (hw_util->pghw_util > old_utilization)
                utilization_delta = hw_util->pghw_util - old_utilization;
        else
                utilization_delta = 0;

        /*
         * Update utilization rate if the interval between samples is
         * sufficient.
         */
        ASSERT(cu_sample_interval_min > CU_SCALE);
        if (time_delta > CU_SAMPLE_INTERVAL_MIN)
                hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);

        /*
         * Update the maximum observed rate
         */
        if (hw_util->pghw_rate_max < hw_util->pghw_rate)
                hw_util->pghw_rate_max = hw_util->pghw_rate;
}
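
/*
 * A worked example of the aggregation above (hypothetical numbers): consider
 * an integer pipeline PG with two CPUs whose summed per-CPU cs_value_total
 * grew from 1,000,000 to 1,600,000 instructions between two cu_pg_update()
 * calls one second (1,000,000,000ns) apart.  Then utilization_delta is
 * 600,000, time_delta is 1,000,000,000, and
 *
 *      pghw_rate = CU_RATE(600000, 1000000000)
 *                = (600000 * (NANOSEC / CU_SCALE)) / (1000000000 / CU_SCALE)
 *                = (600000 * 100000) / 100000
 *                = 600,000 instructions per second
 *
 * for the PG as a whole, with pghw_rate_max tracking the largest such rate
 * observed over the lifetime of the PG.
 */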