1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25#include <alloca.h>
26#include <assert.h>
27#include <dirent.h>
28#include <dlfcn.h>
29#include <door.h>
30#include <errno.h>
31#include <exacct.h>
32#include <ctype.h>
33#include <fcntl.h>
34#include <kstat.h>
35#include <libcontract.h>
36#include <libintl.h>
37#include <libscf.h>
38#include <zonestat.h>
39#include <zonestat_impl.h>
40#include <limits.h>
41#include <pool.h>
42#include <procfs.h>
43#include <rctl.h>
44#include <thread.h>
45#include <signal.h>
46#include <stdarg.h>
47#include <stddef.h>
48#include <stdio.h>
49#include <stdlib.h>
50#include <strings.h>
51#include <synch.h>
52#include <sys/acctctl.h>
53#include <sys/contract/process.h>
54#include <sys/ctfs.h>
55#include <sys/fork.h>
56#include <sys/param.h>
57#include <sys/priocntl.h>
58#include <sys/fxpriocntl.h>
59#include <sys/processor.h>
60#include <sys/pset.h>
61#include <sys/socket.h>
62#include <sys/stat.h>
63#include <sys/statvfs.h>
64#include <sys/swap.h>
65#include <sys/systeminfo.h>
66#include <thread.h>
67#include <sys/list.h>
68#include <sys/time.h>
69#include <sys/types.h>
70#include <sys/vm_usage.h>
71#include <sys/wait.h>
72#include <sys/zone.h>
73#include <time.h>
74#include <ucred.h>
75#include <unistd.h>
76#include <vm/anon.h>
77#include <zone.h>
78#include <zonestat.h>
79
80#define	MAX_PSET_NAME	1024	/* Taken from PV_NAME_MAX_LEN */
81#define	ZSD_PSET_UNLIMITED	UINT16_MAX
82#define	ZONESTAT_EXACCT_FILE	"/var/adm/exacct/zonestat-process"
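/*
 * Note: pool pset.min and pset.max values at or above ZSD_PSET_UNLIMITED are
 * reported as ZS_LIMIT_NONE (see zsd_get_pool_pset() below).
 */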
83
84/*
 * zonestatd gathers cpu and memory utilization data for running zones.  It
 * has these components:
87 *
88 * zsd_server:
89 *	Door server to respond to client connections.  Each client
90 *	will connect using libzonestat.so, which will open and
91 *	call /var/tmp/.zonestat_door.  Each connecting client is given
92 *	a file descriptor to the stat server.
93 *
94 *	The zsd_server also responds to zoneadmd, which reports when a
95 *	new zone is booted.  This is used to fattach the zsd_server door
96 *	into the new zone.
97 *
98 * zsd_stat_server:
99 *	Receives client requests for the current utilization data.  Each
100 *	client request will cause zonestatd to update the current utilization
101 *	data by kicking the stat_thread.
102 *
103 *	If the client is in a non-global zone, the utilization data will
104 *	be filtered to only show the given zone.  The usage by all other zones
105 *	will be added to the system utilization.
106 *
107 * stat_thread:
 *	The stat thread queries the system to determine the current
 *	utilization data for each running zone.  This includes inspecting the
 *	system's processor set configuration, as well as details of each
 *	zone, such as its configured limits and which processor sets it is
 *	running in.
113 *
114 *	The stat_thread will only update memory utilization data as often as
115 *	the configured config/sample_interval on the zones-monitoring service.
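 *
 *	As a rough sketch (the actual handshake lives in libzonestat.so), a
 *	connecting client is expected to do something like:
 *
 *		fd = open(ZS_DOOR_PATH, O_RDONLY);
 *		(void) door_call(fd, &arg);
 *
 *	with the zsd_server handing back a descriptor to the zsd_stat_server
 *	door via the door_arg_t results.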
116 */
117
118/*
119 * The private vmusage structure unfortunately uses size_t types, and assumes
120 * the caller's bitness matches the kernel's bitness.  Since the getvmusage()
121 * system call is contracted, and zonestatd is 32 bit, the following structures
122 * are used to interact with a 32bit or 64 bit kernel.
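 *
 * The choice between the two layouts below is presumably made from the
 * kernel data model tracked in zsctl_kern_bits (see zsd_ctl_t below):
 * zsd_vmusage32_t against a 32-bit kernel and zsd_vmusage64_t against a
 * 64-bit kernel.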
123 */
124typedef struct zsd_vmusage32 {
125	id_t vmu_zoneid;
126	uint_t vmu_type;
127	id_t vmu_id;
128
129	uint32_t vmu_rss_all;
130	uint32_t vmu_rss_private;
131	uint32_t vmu_rss_shared;
132	uint32_t vmu_swap_all;
133	uint32_t vmu_swap_private;
134	uint32_t vmu_swap_shared;
135} zsd_vmusage32_t;
136
137typedef struct zsd_vmusage64 {
138	id_t vmu_zoneid;
139	uint_t vmu_type;
140	id_t vmu_id;
141	/*
142	 * An amd64 kernel will align the following uint64_t members, but a
143	 * 32bit i386 process will not without help.
144	 */
145	int vmu_align_next_members_on_8_bytes;
146	uint64_t vmu_rss_all;
147	uint64_t vmu_rss_private;
148	uint64_t vmu_rss_shared;
149	uint64_t vmu_swap_all;
150	uint64_t vmu_swap_private;
151	uint64_t vmu_swap_shared;
152} zsd_vmusage64_t;
153
154struct zsd_zone;
155
156/* Used to store a zone's usage of a pset */
157typedef struct zsd_pset_usage {
158	struct zsd_zone	*zsu_zone;
159	struct zsd_pset	*zsu_pset;
160
161	list_node_t	zsu_next;
162
163	zoneid_t	zsu_zoneid;
164	boolean_t	zsu_found;	/* zone bound at end of interval */
165	boolean_t	zsu_active;	/* zone was bound during interval */
166	boolean_t	zsu_new;	/* zone newly bound in this interval */
167	boolean_t	zsu_deleted;	/* zone was unbound in this interval */
168	boolean_t	zsu_empty;	/* no procs in pset in this interval */
169	time_t		zsu_start;	/* time when zone was found in pset */
	hrtime_t	zsu_hrstart;	/* hrtime when zone found in pset */
171	uint64_t	zsu_cpu_shares;
172	uint_t		zsu_scheds;	/* schedulers found in this pass */
173	timestruc_t	zsu_cpu_usage;	/* cpu time used */
174} zsd_pset_usage_t;
175
176/* Used to store a pset's utilization */
177typedef struct zsd_pset {
178	psetid_t	zsp_id;
179	list_node_t	zsp_next;
180	char		zsp_name[ZS_PSETNAME_MAX];
181
182	uint_t		zsp_cputype;	/* default, dedicated or shared */
183	boolean_t	zsp_found;	/* pset found at end of interval */
184	boolean_t	zsp_new;	/* pset new in this interval */
185	boolean_t	zsp_deleted;	/* pset deleted in this interval */
186	boolean_t	zsp_active;	/* pset existed during interval */
187	boolean_t	zsp_empty;	/* no processes in pset */
188	time_t		zsp_start;
189	hrtime_t	zsp_hrstart;
190
191	uint64_t	zsp_online;	/* online cpus in interval */
192	uint64_t	zsp_size;	/* size in this interval */
193	uint64_t	zsp_min;	/* configured min in this interval */
194	uint64_t	zsp_max;	/* configured max in this interval */
	int64_t		zsp_importance;	/* configured importance */
196
197	uint_t		zsp_scheds;	/* scheds of processes found in pset */
198	uint64_t	zsp_cpu_shares;	/* total shares in this interval */
199
200	timestruc_t	zsp_total_time;
201	timestruc_t	zsp_usage_kern;
202	timestruc_t	zsp_usage_zones;
203
204	/* Individual zone usages of pset */
205	list_t		zsp_usage_list;
206	int		zsp_nusage;
207
208	/* Summed kstat values from individual cpus in pset */
209	timestruc_t	zsp_idle;
210	timestruc_t	zsp_intr;
211	timestruc_t	zsp_kern;
212	timestruc_t	zsp_user;
213
214} zsd_pset_t;
215
216/* Used to track an individual cpu's utilization as reported by kstats */
217typedef struct zsd_cpu {
218	processorid_t	zsc_id;
219	list_node_t	zsc_next;
220	psetid_t	zsc_psetid;
221	psetid_t	zsc_psetid_prev;
222	zsd_pset_t	*zsc_pset;
223
224	boolean_t	zsc_found;	/* cpu online in this interval */
225	boolean_t	zsc_onlined;	/* cpu onlined during this interval */
226	boolean_t	zsc_offlined;	/* cpu offlined during this interval */
227	boolean_t	zsc_active;	/* cpu online during this interval */
228	boolean_t	zsc_allocated;	/* True if cpu has ever been found */
229
230	/* kstats this interval */
231	uint64_t	zsc_nsec_idle;
232	uint64_t	zsc_nsec_intr;
233	uint64_t	zsc_nsec_kern;
234	uint64_t	zsc_nsec_user;
235
	/* kstats from the previous interval */
237	uint64_t	zsc_nsec_idle_prev;
238	uint64_t	zsc_nsec_intr_prev;
239	uint64_t	zsc_nsec_kern_prev;
240	uint64_t	zsc_nsec_user_prev;
241
242	/* Total kstat increases since zonestatd started reading kstats */
243	timestruc_t	zsc_idle;
244	timestruc_t	zsc_intr;
245	timestruc_t	zsc_kern;
246	timestruc_t	zsc_user;
247
248} zsd_cpu_t;
249
250/* Used to describe an individual zone and its utilization */
251typedef struct zsd_zone {
252	zoneid_t	zsz_id;
253	list_node_t	zsz_next;
254	char		zsz_name[ZS_ZONENAME_MAX];
255	uint_t		zsz_cputype;
256	uint_t		zsz_iptype;
257	time_t		zsz_start;
258	hrtime_t	zsz_hrstart;
259
260	char		zsz_pool[ZS_POOLNAME_MAX];
261	char		zsz_pset[ZS_PSETNAME_MAX];
262	int		zsz_default_sched;
263	/* These are deduced by inspecting processes */
264	psetid_t	zsz_psetid;
265	uint_t		zsz_scheds;
266
267	boolean_t	zsz_new;	/* zone booted during this interval */
268	boolean_t	zsz_deleted;	/* halted during this interval */
269	boolean_t	zsz_active;	/* running in this interval */
270	boolean_t	zsz_empty;	/* no processes in this interval */
271	boolean_t	zsz_gone;	/* not installed in this interval */
272	boolean_t	zsz_found;	/* Running at end of this interval */
273
274	uint64_t	zsz_cpu_shares;
275	uint64_t	zsz_cpu_cap;
276	uint64_t	zsz_ram_cap;
277	uint64_t	zsz_locked_cap;
278	uint64_t	zsz_vm_cap;
279
280	uint64_t	zsz_cpus_online;
	timestruc_t	zsz_cpu_usage;	/* cpu time used by zone */
282	timestruc_t	zsz_cap_time;	/* cpu time of cpu cap */
283	timestruc_t	zsz_share_time; /* cpu time of share of cpu */
284	timestruc_t	zsz_pset_time;  /* time of all psets zone is bound to */
285
286	uint64_t	zsz_usage_ram;
287	uint64_t	zsz_usage_locked;
288	uint64_t	zsz_usage_vm;
289
290	uint64_t	zsz_processes_cap;
291	uint64_t	zsz_lwps_cap;
292	uint64_t	zsz_shm_cap;
293	uint64_t	zsz_shmids_cap;
294	uint64_t	zsz_semids_cap;
295	uint64_t	zsz_msgids_cap;
296	uint64_t	zsz_lofi_cap;
297
298	uint64_t	zsz_processes;
299	uint64_t	zsz_lwps;
300	uint64_t	zsz_shm;
301	uint64_t	zsz_shmids;
302	uint64_t	zsz_semids;
303	uint64_t	zsz_msgids;
304	uint64_t	zsz_lofi;
305
306} zsd_zone_t;
307
308/*
 * Used to track the cpu usage of individual processes.
 *
 * zonestatd sweeps /proc each interval and charges the cpu usage of processes
 * to their zone.  As processes exit, their extended accounting records are
 * read and the difference between their total and known usage is charged to
 * their zone.
315 *
316 * If a process is never seen in /proc, the total usage on its extended
317 * accounting record will be charged to its zone.
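 *
 * For example, if an exiting process had accumulated 10 seconds of cpu time
 * and 8 seconds had already been charged from /proc sweeps, only the
 * remaining 2 seconds from its accounting record are charged to its zone.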
318 */
319typedef struct zsd_proc {
320	list_node_t	zspr_next;
321	pid_t		zspr_ppid;
322	psetid_t	zspr_psetid;
323	zoneid_t	zspr_zoneid;
324	int		zspr_sched;
325	timestruc_t	zspr_usage;
326} zsd_proc_t;
327
328/* Used to track the overall resource usage of the system */
329typedef struct zsd_system {
330
331	uint64_t zss_ram_total;
332	uint64_t zss_ram_kern;
333	uint64_t zss_ram_zones;
334
335	uint64_t zss_locked_kern;
336	uint64_t zss_locked_zones;
337
338	uint64_t zss_vm_total;
339	uint64_t zss_vm_kern;
340	uint64_t zss_vm_zones;
341
342	uint64_t zss_swap_total;
343	uint64_t zss_swap_used;
344
345	timestruc_t zss_idle;
346	timestruc_t zss_intr;
347	timestruc_t zss_kern;
348	timestruc_t zss_user;
349
350	timestruc_t zss_cpu_total_time;
351	timestruc_t zss_cpu_usage_kern;
352	timestruc_t zss_cpu_usage_zones;
353
354	uint64_t zss_maxpid;
355	uint64_t zss_processes_max;
356	uint64_t zss_lwps_max;
357	uint64_t zss_shm_max;
358	uint64_t zss_shmids_max;
359	uint64_t zss_semids_max;
360	uint64_t zss_msgids_max;
361	uint64_t zss_lofi_max;
362
363	uint64_t zss_processes;
364	uint64_t zss_lwps;
365	uint64_t zss_shm;
366	uint64_t zss_shmids;
367	uint64_t zss_semids;
368	uint64_t zss_msgids;
369	uint64_t zss_lofi;
370
371	uint64_t zss_ncpus;
372	uint64_t zss_ncpus_online;
373
374} zsd_system_t;
375
376/*
377 * A dumping ground for various information and structures used to compute
378 * utilization.
379 *
380 * This structure is used to track the system while clients are connected.
 * When the first client connects, a zsd_ctl is allocated and configured by
382 * zsd_open().  When all clients disconnect, the zsd_ctl is closed.
383 */
384typedef struct zsd_ctl {
385	kstat_ctl_t	*zsctl_kstat_ctl;
386
387	/* To track extended accounting */
388	int		zsctl_proc_fd;		/* Log currently being used */
389	ea_file_t	zsctl_proc_eaf;
390	struct stat64	zsctl_proc_stat;
391	int		zsctl_proc_open;
392	int		zsctl_proc_fd_next;	/* Log file to use next */
393	ea_file_t	zsctl_proc_eaf_next;
394	struct stat64	zsctl_proc_stat_next;
395	int		zsctl_proc_open_next;
396
397	/* pool configuration handle */
398	pool_conf_t	*zsctl_pool_conf;
399	int		zsctl_pool_status;
400	int		zsctl_pool_changed;
401
	/* The above usage tracking structures */
403	zsd_system_t	*zsctl_system;
404	list_t		zsctl_zones;
405	list_t		zsctl_psets;
406	list_t		zsctl_cpus;
407	zsd_cpu_t	*zsctl_cpu_array;
408	zsd_proc_t	*zsctl_proc_array;
409
410	/* Various system info */
411	uint64_t	zsctl_maxcpuid;
412	uint64_t	zsctl_maxproc;
413	uint64_t	zsctl_kern_bits;
414	uint64_t	zsctl_pagesize;
415
416	/* Used to track time available under a cpu cap. */
417	uint64_t	zsctl_hrtime;
418	uint64_t	zsctl_hrtime_prev;
419	timestruc_t	zsctl_hrtime_total;
420
421	struct timeval	zsctl_timeofday;
422
423	/* Caches for arrays allocated for use by various system calls */
424	psetid_t	*zsctl_pset_cache;
425	uint_t		zsctl_pset_ncache;
426	processorid_t	*zsctl_cpu_cache;
427	uint_t		zsctl_cpu_ncache;
428	zoneid_t	*zsctl_zone_cache;
429	uint_t		zsctl_zone_ncache;
430	struct swaptable *zsctl_swap_cache;
431	uint64_t	zsctl_swap_cache_size;
432	uint64_t	zsctl_swap_cache_num;
433	zsd_vmusage64_t	*zsctl_vmusage_cache;
434	uint64_t	zsctl_vmusage_cache_num;
435
436	/* Info about procfs for scanning /proc */
437	struct dirent	*zsctl_procfs_dent;
438	long		zsctl_procfs_dent_size;
439	pool_value_t	*zsctl_pool_vals[3];
440
	/* Counts of tracked entities */
442	uint_t		zsctl_nzones;
443	uint_t		zsctl_npsets;
444	uint_t		zsctl_npset_usages;
445} zsd_ctl_t;
446
447zsd_ctl_t		*g_ctl;
448boolean_t		g_open;		/* True if g_ctl is open */
449int			g_hasclient;	/* True if any clients are connected */
450
451/*
452 * The usage cache is updated by the stat_thread, and copied to clients by
453 * the zsd_stat_server.  Mutex and cond are to synchronize between the
454 * stat_thread and the stat_server.
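 *
 * The intended handshake (implemented later in this file) is roughly: a
 * client request bumps g_usage_cache_kickers and signals g_usage_cache_kick;
 * the stat_thread refreshes the cache, advances g_gen_next, and wakes the
 * waiting requests via g_usage_cache_wait.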
455 */
456zs_usage_cache_t	*g_usage_cache;
457mutex_t			g_usage_cache_lock;
458cond_t			g_usage_cache_kick;
459uint_t			g_usage_cache_kickers;
460cond_t			g_usage_cache_wait;
461char			*g_usage_cache_buf;
462uint_t			g_usage_cache_bufsz;
463uint64_t		g_gen_next;
464
465/* fds of door servers */
466int			g_server_door;
467int			g_stat_door;
468
469/*
470 * Starting and current time.  Used to throttle memory calculation, and to
471 * mark new zones and psets with their boot and creation time.
472 */
473time_t			g_now;
474time_t			g_start;
475hrtime_t		g_hrnow;
476hrtime_t		g_hrstart;
477uint64_t		g_interval;
478
479/*
480 * main() thread.
481 */
482thread_t		g_main;
483
484/* PRINTFLIKE1 */
485static void
486zsd_warn(const char *fmt, ...)
487{
488	va_list alist;
489
490	va_start(alist, fmt);
491
492	(void) fprintf(stderr, gettext("zonestat: Warning: "));
493	(void) vfprintf(stderr, fmt, alist);
494	(void) fprintf(stderr, "\n");
495	va_end(alist);
496}
497
498/* PRINTFLIKE1 */
499static void
500zsd_error(const char *fmt, ...)
501{
502	va_list alist;
503
504	va_start(alist, fmt);
505
506	(void) fprintf(stderr, gettext("zonestat: Error: "));
507	(void) vfprintf(stderr, fmt, alist);
508	(void) fprintf(stderr, "\n");
509	va_end(alist);
510	exit(1);
511}
512
513/* Turns on extended accounting if not configured externally */
514int
515zsd_enable_cpu_stats()
516{
517	char *path = ZONESTAT_EXACCT_FILE;
518	char oldfile[MAXPATHLEN];
519	int ret, state = AC_ON;
520	ac_res_t res[6];
521
522	/*
	 * Start a new accounting file if accounting is not configured
524	 * externally.
525	 */
526
527	res[0].ar_id = AC_PROC_PID;
528	res[0].ar_state = AC_ON;
529	res[1].ar_id = AC_PROC_ANCPID;
530	res[1].ar_state = AC_ON;
531	res[2].ar_id = AC_PROC_CPU;
532	res[2].ar_state = AC_ON;
533	res[3].ar_id = AC_PROC_TIME;
534	res[3].ar_state = AC_ON;
535	res[4].ar_id = AC_PROC_ZONENAME;
536	res[4].ar_state = AC_ON;
537	res[5].ar_id = AC_NONE;
538	res[5].ar_state = AC_ON;
539	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
540		zsd_warn(gettext("Unable to set accounting resources"));
541		return (-1);
542	}
543	/* Only set accounting file if none is configured */
544	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
545	if (ret < 0) {
546
547		(void) unlink(path);
548		if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
549		    == -1) {
550			zsd_warn(gettext("Unable to set accounting file"));
551			return (-1);
552		}
553	}
554	if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
555		zsd_warn(gettext("Unable to enable accounting"));
556		return (-1);
557	}
558	return (0);
559}
560
561/* Turns off extended accounting if not configured externally */
562int
563zsd_disable_cpu_stats()
564{
565	char *path = ZONESTAT_EXACCT_FILE;
566	int ret, state = AC_OFF;
567	ac_res_t res[6];
568	char oldfile[MAXPATHLEN];
569
570	/* If accounting file is externally configured, leave it alone */
571	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
572	if (ret == 0 && strcmp(oldfile, path) != 0)
573		return (0);
574
575	res[0].ar_id = AC_PROC_PID;
576	res[0].ar_state = AC_OFF;
577	res[1].ar_id = AC_PROC_ANCPID;
578	res[1].ar_state = AC_OFF;
579	res[2].ar_id = AC_PROC_CPU;
580	res[2].ar_state = AC_OFF;
581	res[3].ar_id = AC_PROC_TIME;
582	res[3].ar_state = AC_OFF;
583	res[4].ar_id = AC_PROC_ZONENAME;
584	res[4].ar_state = AC_OFF;
585	res[5].ar_id = AC_NONE;
586	res[5].ar_state = AC_OFF;
587	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
588		zsd_warn(gettext("Unable to clear accounting resources"));
589		return (-1);
590	}
591	if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
592		zsd_warn(gettext("Unable to clear accounting file"));
593		return (-1);
594	}
595	if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
		zsd_warn(gettext("Unable to disable accounting"));
597		return (-1);
598	}
599
600	(void) unlink(path);
601	return (0);
602}
603
604/*
605 * If not configured externally, deletes the current extended accounting file
606 * and starts a new one.
607 *
608 * Since the stat_thread holds an open handle to the accounting file, it will
609 * read all remaining entries from the old file before switching to
610 * read the new one.
611 */
612int
613zsd_roll_exacct(void)
614{
615	int ret;
616	char *path = ZONESTAT_EXACCT_FILE;
617	char oldfile[MAXPATHLEN];
618
619	/* If accounting file is externally configured, leave it alone */
620	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
621	if (ret == 0 && strcmp(oldfile, path) != 0)
622		return (0);
623
624	if (unlink(path) != 0)
625		/* Roll it next time */
626		return (0);
627
628	if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
629		zsd_warn(gettext("Unable to set accounting file"));
630		return (-1);
631	}
632	return (0);
633}
634
635/* Contract stuff for zone_enter() */
636int
637init_template(void)
638{
639	int fd;
640	int err = 0;
641
642	fd = open64(CTFS_ROOT "/process/template", O_RDWR);
643	if (fd == -1)
644		return (-1);
645
646	/*
	 * For now, zonestatd doesn't do anything with the contract.
648	 * Deliver no events, don't inherit, and allow it to be orphaned.
649	 */
650	err |= ct_tmpl_set_critical(fd, 0);
651	err |= ct_tmpl_set_informative(fd, 0);
652	err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
653	err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
654	if (err || ct_tmpl_activate(fd)) {
655		(void) close(fd);
656		return (-1);
657	}
658
659	return (fd);
660}
661
662/*
663 * Contract stuff for zone_enter()
664 */
665int
666contract_latest(ctid_t *id)
667{
668	int cfd, r;
669	ct_stathdl_t st;
670	ctid_t result;
671
672	if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
673		return (errno);
674
675	if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
676		(void) close(cfd);
677		return (r);
678	}
679
680	result = ct_status_get_id(st);
681	ct_status_free(st);
682	(void) close(cfd);
683
684	*id = result;
685	return (0);
686}
687
688static int
689close_on_exec(int fd)
690{
691	int flags = fcntl(fd, F_GETFD, 0);
692	if ((flags != -1) && (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) != -1))
693		return (0);
694	return (-1);
695}
696
697int
698contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
699{
700	char path[PATH_MAX];
701	int n, fd;
702
703	if (type == NULL)
704		type = "all";
705
706	n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file);
707	if (n >= sizeof (path)) {
708		errno = ENAMETOOLONG;
709		return (-1);
710	}
711
712	fd = open64(path, oflag);
713	if (fd != -1) {
714		if (close_on_exec(fd) == -1) {
715			int err = errno;
716			(void) close(fd);
717			errno = err;
718			return (-1);
719		}
720	}
721	return (fd);
722}
723
724int
725contract_abandon_id(ctid_t ctid)
726{
727	int fd, err;
728
729	fd = contract_open(ctid, "all", "ctl", O_WRONLY);
730	if (fd == -1)
731		return (errno);
732
733	err = ct_ctl_abandon(fd);
734	(void) close(fd);
735
736	return (err);
737}
/*
 * Attach the zsd_server door to a zone.  Called for each zone when zonestatd
 * starts, and for each newly booted zone when zoneadmd contacts the
 * zsd_server.
 *
 * zone_enter() is used to avoid reaching into the zone to fattach the door.
 */
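/*
 * The forked child clears the contract template, enters the zone (for
 * non-global zones), detaches and unlinks any stale door file, and then,
 * unless detach_only is set, recreates the file and fattaches the door to it.
 */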
744static void
745zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
746{
747	char *path = ZS_DOOR_PATH;
748	int fd, pid, stat, tmpl_fd;
749	ctid_t ct;
750
751	if ((tmpl_fd = init_template()) == -1) {
752		zsd_warn("Unable to init template");
753		return;
754	}
755
756	pid = forkx(0);
757	if (pid < 0) {
758		(void) ct_tmpl_clear(tmpl_fd);
		zsd_warn(gettext(
		    "Unable to fork to add zonestat to zoneid %d"), zid);
761		return;
762	}
763
764	if (pid == 0) {
765		(void) ct_tmpl_clear(tmpl_fd);
766		(void) close(tmpl_fd);
767		if (zid != 0 && zone_enter(zid) != 0) {
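			/*
			 * EINVAL presumably means the zone is no longer
			 * running (it may have halted since it was listed),
			 * so treat that case as success.
			 */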
768			if (errno == EINVAL) {
769				_exit(0);
770			}
771			_exit(1);
772		}
773		(void) fdetach(path);
774		(void) unlink(path);
775		if (detach_only)
776			_exit(0);
777		fd = open(path, O_CREAT|O_RDWR, 0644);
778		if (fd < 0)
779			_exit(2);
780		if (fattach(door, path) != 0)
781			_exit(3);
782		_exit(0);
783	}
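	/*
	 * In the parent, abandon the child's process contract so the
	 * short-lived helper does not leave a contract behind, then reap it.
	 */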
784	if (contract_latest(&ct) == -1)
785		ct = -1;
786	(void) ct_tmpl_clear(tmpl_fd);
787	(void) close(tmpl_fd);
788	(void) contract_abandon_id(ct);
789	while (waitpid(pid, &stat, 0) != pid)
790		;
791	if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
792		return;
793
794	zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
795
796	if (WEXITSTATUS(stat) == 1)
		zsd_warn(gettext("Cannot enter zone"));
798	else if (WEXITSTATUS(stat) == 2)
799		zsd_warn(gettext("Unable to create door file: %s"), path);
800	else if (WEXITSTATUS(stat) == 3)
801		zsd_warn(gettext("Unable to fattach file: %s"), path);
802
803	zsd_warn(gettext("Internal error entering zone: %d"), zid);
804}
805
806/*
807 * Zone lookup and allocation functions to manage list of currently running
808 * zones.
809 */
810static zsd_zone_t *
811zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
812{
813	zsd_zone_t *zone;
814
815	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
816	    zone = list_next(&ctl->zsctl_zones, zone)) {
817		if (strcmp(zone->zsz_name, zonename) == 0) {
818			if (zoneid != -1)
819				zone->zsz_id = zoneid;
820			return (zone);
821		}
822	}
823	return (NULL);
824}
825
826static zsd_zone_t *
827zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
828{
829	zsd_zone_t *zone;
830
831	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
832	    zone = list_next(&ctl->zsctl_zones, zone)) {
833		if (zone->zsz_id == zoneid)
834			return (zone);
835	}
836	return (NULL);
837}
838
839static zsd_zone_t *
840zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
841{
842	zsd_zone_t *zone;
843
844	if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
845		return (NULL);
846
847	(void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
848	zone->zsz_id = zoneid;
849	zone->zsz_found = B_FALSE;
850
851	/*
852	 * Allocate as deleted so if not found in first pass, zone is deleted
853	 * from list.  This can happen if zone is returned by zone_list, but
854	 * exits before first attempt to fetch zone details.
855	 */
856	zone->zsz_start = g_now;
857	zone->zsz_hrstart = g_hrnow;
858	zone->zsz_deleted = B_TRUE;
859
860	zone->zsz_cpu_shares = ZS_LIMIT_NONE;
861	zone->zsz_cpu_cap = ZS_LIMIT_NONE;
862	zone->zsz_ram_cap = ZS_LIMIT_NONE;
863	zone->zsz_locked_cap = ZS_LIMIT_NONE;
864	zone->zsz_vm_cap = ZS_LIMIT_NONE;
865
866	zone->zsz_processes_cap = ZS_LIMIT_NONE;
867	zone->zsz_lwps_cap = ZS_LIMIT_NONE;
868	zone->zsz_shm_cap = ZS_LIMIT_NONE;
869	zone->zsz_shmids_cap = ZS_LIMIT_NONE;
870	zone->zsz_semids_cap = ZS_LIMIT_NONE;
871	zone->zsz_msgids_cap = ZS_LIMIT_NONE;
872	zone->zsz_lofi_cap = ZS_LIMIT_NONE;
873
874	ctl->zsctl_nzones++;
875
876	return (zone);
877}
878
879static zsd_zone_t *
880zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
881{
882	zsd_zone_t *zone, *tmp;
883
884	if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
885		return (zone);
886
887	if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
888		return (NULL);
889
890	/* Insert sorted by zonename */
891	tmp = list_head(&ctl->zsctl_zones);
892	while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
893		tmp = list_next(&ctl->zsctl_zones, tmp);
894
895	list_insert_before(&ctl->zsctl_zones, tmp, zone);
896	return (zone);
897}
898
899/*
900 * Mark all zones as not existing.  As zones are found, they will
901 * be marked as existing.  If a zone is not found, then it must have
902 * halted.
903 */
904static void
905zsd_mark_zones_start(zsd_ctl_t *ctl)
906{
907
908	zsd_zone_t *zone;
909
910	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
911	    zone = list_next(&ctl->zsctl_zones, zone)) {
912		zone->zsz_found = B_FALSE;
913	}
914}
915
/*
 * Mark each zone as not using the pset.  If processes are found using the
 * pset, the zone will remain bound to the pset.  If none of a zone's
 * processes are bound to the pset, the zone's usage of the pset will
 * be deleted.
 */
923static void
924zsd_mark_pset_usage_start(zsd_pset_t *pset)
925{
926	zsd_pset_usage_t *usage;
927
928	for (usage = list_head(&pset->zsp_usage_list);
929	    usage != NULL;
930	    usage = list_next(&pset->zsp_usage_list, usage)) {
931		usage->zsu_found = B_FALSE;
932		usage->zsu_empty = B_TRUE;
933	}
934}
935
936/*
937 * Mark each pset as not existing.  If a pset is found, it will be marked
 * as existing.  If a pset is not found, it will be deleted.
939 */
940static void
941zsd_mark_psets_start(zsd_ctl_t *ctl)
942{
943	zsd_pset_t *pset;
944
945	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
946	    pset = list_next(&ctl->zsctl_psets, pset)) {
947		pset->zsp_found = B_FALSE;
948		zsd_mark_pset_usage_start(pset);
949	}
950}
951
952/*
953 * A pset was found.  Update its information
954 */
955static void
956zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
957    uint64_t size, uint64_t min, uint64_t max, int64_t importance)
958{
959	pset->zsp_empty = B_TRUE;
960	pset->zsp_deleted = B_FALSE;
961
962	assert(pset->zsp_found == B_FALSE);
963
964	/* update pset flags */
965	if (pset->zsp_active == B_FALSE)
966		/* pset not seen on previous interval.  It is new. */
967		pset->zsp_new = B_TRUE;
968	else
969		pset->zsp_new = B_FALSE;
970
971	pset->zsp_found = B_TRUE;
972	pset->zsp_cputype = type;
973	pset->zsp_online = online;
974	pset->zsp_size = size;
975	pset->zsp_min = min;
976	pset->zsp_max = max;
977	pset->zsp_importance = importance;
978	pset->zsp_cpu_shares = 0;
979	pset->zsp_scheds = 0;
980	pset->zsp_active = B_TRUE;
981}
982
983/*
984 * A zone's process was found using a pset. Charge the process to the pset and
985 * the per-zone data for the pset.
986 */
987static void
988zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
989{
990	zsd_zone_t *zone = usage->zsu_zone;
991	zsd_pset_t *pset = usage->zsu_pset;
992
993	/* Nothing to do if already found */
994	if (usage->zsu_found == B_TRUE)
995		goto add_stats;
996
997	usage->zsu_found = B_TRUE;
998	usage->zsu_empty = B_FALSE;
999
1000	usage->zsu_deleted = B_FALSE;
1001	/* update usage flags */
1002	if (usage->zsu_active == B_FALSE)
1003		usage->zsu_new = B_TRUE;
1004	else
1005		usage->zsu_new = B_FALSE;
1006
1007	usage->zsu_scheds = 0;
1008	usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1009	usage->zsu_active = B_TRUE;
1010	pset->zsp_empty = B_FALSE;
1011	zone->zsz_empty = B_FALSE;
1012
1013add_stats:
1014	/* Detect zone's pset id, and if it is bound to multiple psets */
1015	if (zone->zsz_psetid == ZS_PSET_ERROR)
1016		zone->zsz_psetid = pset->zsp_id;
1017	else if (zone->zsz_psetid != pset->zsp_id)
1018		zone->zsz_psetid = ZS_PSET_MULTI;
1019
1020	usage->zsu_scheds |= sched;
1021	pset->zsp_scheds |= sched;
1022	zone->zsz_scheds |= sched;
1023
	/* Record if FSS is co-habiting with a conflicting scheduler */
1025	if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
1026	    usage->zsu_scheds & (
1027	    ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
1028		usage->zsu_scheds |= ZS_SCHED_CONFLICT;
1029
1030		pset->zsp_scheds |= ZS_SCHED_CONFLICT;
1031	}
1032
1033}
1034
1035/* Add cpu time for a process to a pset, zone, and system totals */
1036static void
1037zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1038{
1039	zsd_system_t *system = ctl->zsctl_system;
1040	zsd_zone_t *zone = usage->zsu_zone;
1041	zsd_pset_t *pset = usage->zsu_pset;
1042
1043	TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1044	TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1045	TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1046	TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1047}
1048
1049/* Determine which processor sets have been deleted */
1050static void
1051zsd_mark_psets_end(zsd_ctl_t *ctl)
1052{
1053	zsd_pset_t *pset, *tmp;
1054
	/*
	 * For each pset not found in this interval, mark it deleted, or
	 * remove it if it was already marked deleted in the previous
	 * interval.
	 */
1059	pset = list_head(&ctl->zsctl_psets);
1060	while (pset != NULL) {
1061		if (pset->zsp_found == B_FALSE) {
1062			pset->zsp_empty = B_TRUE;
1063			if (pset->zsp_deleted == B_TRUE) {
1064				tmp = pset;
1065				pset = list_next(&ctl->zsctl_psets, pset);
1066				list_remove(&ctl->zsctl_psets, tmp);
1067				free(tmp);
1068				ctl->zsctl_npsets--;
1069				continue;
1070			} else {
1071				/* Pset vanished during this interval */
1072				pset->zsp_new = B_FALSE;
1073				pset->zsp_deleted = B_TRUE;
1074				pset->zsp_active = B_TRUE;
1075			}
1076		}
1077		pset = list_next(&ctl->zsctl_psets, pset);
1078	}
1079}
1080
1081/* Determine which zones are no longer bound to processor sets */
1082static void
1083zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1084{
1085	zsd_pset_t *pset;
1086	zsd_zone_t *zone;
1087	zsd_pset_usage_t *usage, *tmp;
1088
	/*
	 * Remove usages that were not found in this interval, or whose
	 * zone or pset has been deleted.
	 */
1093	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1094	    pset = list_next(&ctl->zsctl_psets, pset)) {
1095		usage = list_head(&pset->zsp_usage_list);
1096		while (usage != NULL) {
			/*
			 * Remove the usage if it was not found in this
			 * interval, or if its zone or pset has been deleted.
			 */
1101			if (usage->zsu_found == B_FALSE ||
1102			    usage->zsu_zone->zsz_deleted == B_TRUE ||
1103			    usage->zsu_pset->zsp_deleted == B_TRUE) {
1104				tmp = usage;
1105				usage = list_next(&pset->zsp_usage_list,
1106				    usage);
1107				list_remove(&pset->zsp_usage_list, tmp);
1108				free(tmp);
1109				pset->zsp_nusage--;
1110				ctl->zsctl_npset_usages--;
1111				continue;
1112			} else {
1113				usage->zsu_new = B_FALSE;
1114				usage->zsu_deleted = B_TRUE;
1115				usage->zsu_active = B_TRUE;
1116			}
1117			/* Add cpu shares for usages that are in FSS */
1118			zone = usage->zsu_zone;
1119			if (usage->zsu_scheds & ZS_SCHED_FSS &&
1120			    zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1121			    zone->zsz_cpu_shares != 0) {
1122				zone = usage->zsu_zone;
1123				usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1124				pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1125			}
1126			usage = list_next(&pset->zsp_usage_list,
1127			    usage);
1128		}
1129	}
1130}
1131
1132/* A zone has been found.  Update its information */
1133static void
1134zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
1135    uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
1136    uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
1137    uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
1138    uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
1139    uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
1140    uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
1141    uint_t iptype)
1142{
1143	zsd_system_t *sys = ctl->zsctl_system;
1144
1145	assert(zone->zsz_found == B_FALSE);
1146
	/*
	 * Mark the zone as existing, and as new if it did not exist in the
	 * previous interval.
	 */
1151	zone->zsz_found = B_TRUE;
1152	zone->zsz_empty = B_TRUE;
1153	zone->zsz_deleted = B_FALSE;
1154
	/*
	 * If the zone was not active in the previous interval, it is new.
	 * Assume a new zone's properties are the same over the entire
	 * interval.
	 */
1159	if (zone->zsz_active == B_FALSE)
1160		zone->zsz_new = B_TRUE;
1161	else
1162		zone->zsz_new = B_FALSE;
1163
1164	(void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
1165	(void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
1166	zone->zsz_default_sched = sched;
1167
1168	/* Schedulers updated later as processes are found */
1169	zone->zsz_scheds = 0;
1170
1171	/* Cpus updated later as psets bound are identified */
1172	zone->zsz_cpus_online = 0;
1173
1174	zone->zsz_cputype = cputype;
1175	zone->zsz_iptype = iptype;
1176	zone->zsz_psetid = ZS_PSET_ERROR;
1177	zone->zsz_cpu_cap = cpu_cap;
1178	zone->zsz_cpu_shares = cpu_shares;
1179	zone->zsz_ram_cap = ram_cap;
1180	zone->zsz_locked_cap = locked_cap;
1181	zone->zsz_vm_cap = vm_cap;
1182	zone->zsz_processes_cap = processes_cap;
1183	zone->zsz_processes = processes;
1184	zone->zsz_lwps_cap = lwps_cap;
1185	zone->zsz_lwps = lwps;
1186	zone->zsz_shm_cap = shm_cap;
1187	zone->zsz_shm = shm;
1188	zone->zsz_shmids_cap = shmids_cap;
1189	zone->zsz_shmids = shmids;
1190	zone->zsz_semids_cap = semids_cap;
1191	zone->zsz_semids = semids;
1192	zone->zsz_msgids_cap = msgids_cap;
1193	zone->zsz_msgids = msgids;
1194	zone->zsz_lofi_cap = lofi_cap;
1195	zone->zsz_lofi = lofi;
1196
1197	sys->zss_processes += processes;
1198	sys->zss_lwps += lwps;
1199	sys->zss_shm += shm;
1200	sys->zss_shmids += shmids;
1201	sys->zss_semids += semids;
1202	sys->zss_msgids += msgids;
1203	sys->zss_lofi += lofi;
1204	zone->zsz_active = B_TRUE;
1205}
1206
1207
1208/* Determine which zones have halted */
1209static void
1210zsd_mark_zones_end(zsd_ctl_t *ctl)
1211{
1212	zsd_zone_t *zone, *tmp;
1213
	/*
	 * For each zone not found in this interval, mark it deleted, or
	 * remove it if it was already marked deleted in the previous
	 * interval.
	 */
1218	zone = list_head(&ctl->zsctl_zones);
1219	while (zone != NULL) {
1220		if (zone->zsz_found == B_FALSE) {
1221			zone->zsz_empty = B_TRUE;
1222			if (zone->zsz_deleted == B_TRUE) {
1223				/*
1224				 * Zone deleted in prior interval,
1225				 * so it no longer exists.
1226				 */
1227				tmp = zone;
1228				zone = list_next(&ctl->zsctl_zones, zone);
1229				list_remove(&ctl->zsctl_zones, tmp);
1230				free(tmp);
1231				ctl->zsctl_nzones--;
1232				continue;
1233			} else {
1234				zone->zsz_new = B_FALSE;
1235				zone->zsz_deleted = B_TRUE;
1236				zone->zsz_active = B_TRUE;
1237			}
1238		}
1239		zone = list_next(&ctl->zsctl_zones, zone);
1240	}
1241}
1242
1243/*
1244 * Mark cpus as not existing.  If a cpu is found, it will be updated.  If
1245 * a cpu is not found, then it must have gone offline, so it will be
1246 * deleted.
1247 *
1248 * The kstat tracking data is rolled so that the usage since the previous
1249 * interval can be determined.
1250 */
1251static void
1252zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1253{
1254	zsd_cpu_t *cpu;
1255
1256	/*
1257	 * Mark all cpus as not existing.  As cpus are found, they will
1258	 * be marked as existing.
1259	 */
1260	for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1261	    cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1262		cpu->zsc_found = B_FALSE;
1263		if (cpu->zsc_active == B_TRUE && roll) {
1264			cpu->zsc_psetid_prev = cpu->zsc_psetid;
1265			cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1266			cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1267			cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1268			cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1269		}
1270	}
1271}
1272
1273/*
1274 * An array the size of the maximum number of cpus is kept.  Within this array
1275 * a list of the online cpus is maintained.
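 *
 * Each entry is presumably pre-initialized with its cpu id when
 * zsctl_cpu_array is allocated; the assert below relies on zsc_id already
 * matching the index.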
1276 */
1277zsd_cpu_t *
1278zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1279{
1280	zsd_cpu_t *cpu;
1281
1282	assert(cpuid < ctl->zsctl_maxcpuid);
1283	cpu = &(ctl->zsctl_cpu_array[cpuid]);
1284	assert(cpuid == cpu->zsc_id);
1285
1286	if (cpu->zsc_allocated == B_FALSE) {
1287		cpu->zsc_allocated = B_TRUE;
1288		list_insert_tail(&ctl->zsctl_cpus, cpu);
1289	}
1290	return (cpu);
1291}
1292
1293/* A cpu has been found.  Update its information */
1294static void
1295zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1296{
	/*
	 * For legacy processor sets, the cpu may move while zonestatd is
	 * inspecting, causing it to be found twice.  In this case, just
	 * leave the cpu in the first processor set in which it was found.
	 */
1302	if (cpu->zsc_found == B_TRUE)
1303		return;
1304
1305	/* Mark cpu as online */
1306	cpu->zsc_found = B_TRUE;
1307	cpu->zsc_offlined = B_FALSE;
1308	cpu->zsc_pset = pset;
	/*
	 * Check whether the cpu is newly online.
	 */
1312	if (cpu->zsc_active == B_FALSE) {
1313		/*
1314		 * Cpu is newly online.
1315		 */
1316		cpu->zsc_onlined = B_TRUE;
1317		cpu->zsc_psetid = psetid;
1318		cpu->zsc_psetid_prev = psetid;
1319	} else {
1320		/*
1321		 * cpu online during previous interval.  Save properties at
1322		 * start of interval
1323		 */
1324		cpu->zsc_onlined = B_FALSE;
1325		cpu->zsc_psetid = psetid;
1326
1327	}
1328	cpu->zsc_active = B_TRUE;
1329}
1330
1331/* Remove all offlined cpus from the list of tracked cpus */
1332static void
1333zsd_mark_cpus_end(zsd_ctl_t *ctl)
1334{
1335	zsd_cpu_t *cpu, *tmp;
1336	int id;
1337
1338	/* Mark cpu as online or offline */
1339	cpu = list_head(&ctl->zsctl_cpus);
1340	while (cpu != NULL) {
1341		if (cpu->zsc_found == B_FALSE) {
1342			if (cpu->zsc_offlined == B_TRUE) {
1343				/*
1344				 * cpu offlined in prior interval. It is gone.
1345				 */
1346				tmp = cpu;
1347				cpu = list_next(&ctl->zsctl_cpus, cpu);
1348				list_remove(&ctl->zsctl_cpus, tmp);
1349				/* Clear structure for future use */
1350				id = tmp->zsc_id;
1351				bzero(tmp, sizeof (zsd_cpu_t));
1352				tmp->zsc_id = id;
1353				tmp->zsc_allocated = B_FALSE;
1354				tmp->zsc_psetid = ZS_PSET_ERROR;
1355				tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1356
1357			} else {
1358				/*
1359				 * cpu online at start of interval.  Treat
1360				 * as still online, since it was online for
1361				 * some portion of the interval.
1362				 */
1363				cpu->zsc_offlined = B_TRUE;
1364				cpu->zsc_onlined = B_FALSE;
1365				cpu->zsc_active = B_TRUE;
1366				cpu->zsc_psetid = cpu->zsc_psetid_prev;
1367				cpu->zsc_pset = NULL;
1368			}
1369		}
1370		cpu = list_next(&ctl->zsctl_cpus, cpu);
1371	}
1372}
1373
1374/* Some utility functions for managing the list of processor sets */
1375static zsd_pset_t *
1376zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1377{
1378	zsd_pset_t *pset;
1379
1380	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1381	    pset = list_next(&ctl->zsctl_psets, pset)) {
1382		if (pset->zsp_id == psetid)
1383			return (pset);
1384	}
1385	return (NULL);
1386}
1387
1388static zsd_pset_t *
1389zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1390{
1391	zsd_pset_t *pset;
1392
1393	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1394	    pset = list_next(&ctl->zsctl_psets, pset)) {
1395		if (strcmp(pset->zsp_name, psetname) == 0) {
1396			if (psetid != -1)
1397				pset->zsp_id = psetid;
1398			return (pset);
1399		}
1400	}
1401	return (NULL);
1402}
1403
1404static zsd_pset_t *
1405zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1406{
1407	zsd_pset_t *pset;
1408
1409	if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1410		return (NULL);
1411
1412	(void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1413	pset->zsp_id = psetid;
1414	pset->zsp_found = B_FALSE;
1415	/*
1416	 * Allocate as deleted so if not found in first pass, pset is deleted
1417	 * from list.  This can happen if pset is returned by pset_list, but
1418	 * is destroyed before first attempt to fetch pset details.
1419	 */
1420	list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1421	    offsetof(zsd_pset_usage_t, zsu_next));
1422
1423	pset->zsp_hrstart = g_hrnow;
1424	pset->zsp_deleted = B_TRUE;
1425	pset->zsp_empty = B_TRUE;
1426	ctl->zsctl_npsets++;
1427
1428	return (pset);
1429}
1430
1431static zsd_pset_t *
1432zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1433{
1434	zsd_pset_t *pset, *tmp;
1435
1436	if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1437		return (pset);
1438
1439	if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1440		return (NULL);
1441
1442	/* Insert sorted by psetname */
1443	tmp = list_head(&ctl->zsctl_psets);
1444	while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1445		tmp = list_next(&ctl->zsctl_psets, tmp);
1446
1447	list_insert_before(&ctl->zsctl_psets, tmp, pset);
1448	return (pset);
1449}
1450
1451/* Some utility functions for managing the list of zones using each pset */
1452static zsd_pset_usage_t *
1453zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1454{
1455	zsd_pset_usage_t *usage;
1456
1457	for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1458	    usage = list_next(&pset->zsp_usage_list, usage))
1459		if (usage->zsu_zone == zone)
1460			return (usage);
1461
1462	return (NULL);
1463}
1464
1465static zsd_pset_usage_t *
1466zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1467{
1468	zsd_pset_usage_t *usage;
1469
1470	if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1471	    == NULL)
1472		return (NULL);
1473
1474	list_link_init(&usage->zsu_next);
1475	usage->zsu_zone = zone;
1476	usage->zsu_zoneid = zone->zsz_id;
1477	usage->zsu_pset = pset;
1478	usage->zsu_found = B_FALSE;
1479	usage->zsu_active = B_FALSE;
1480	usage->zsu_new = B_FALSE;
1481	/*
1482	 * Allocate as not deleted.  If a process is found in a pset for
1483	 * a zone, the usage will not be deleted until at least the next
1484	 * interval.
1485	 */
1486	usage->zsu_start = g_now;
1487	usage->zsu_hrstart = g_hrnow;
1488	usage->zsu_deleted = B_FALSE;
1489	usage->zsu_empty = B_TRUE;
1490	usage->zsu_scheds = 0;
1491	usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1492
1493	ctl->zsctl_npset_usages++;
1494	pset->zsp_nusage++;
1495
1496	return (usage);
1497}
1498
1499static zsd_pset_usage_t *
1500zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1501{
1502	zsd_pset_usage_t *usage, *tmp;
1503
1504	if ((usage = zsd_lookup_usage(pset, zone))
1505	    != NULL)
1506		return (usage);
1507
1508	if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1509		return (NULL);
1510
1511	tmp = list_head(&pset->zsp_usage_list);
1512	while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1513	    > 0)
1514		tmp = list_next(&pset->zsp_usage_list, tmp);
1515
1516	list_insert_before(&pset->zsp_usage_list, tmp, usage);
1517	return (usage);
1518}
1519
1520static void
1521zsd_refresh_system(zsd_ctl_t *ctl)
1522{
1523	zsd_system_t *system = ctl->zsctl_system;
1524
1525	/* Re-count these values each interval */
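	/* zsd_mark_zone_found() re-accumulates them as each zone is found. */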
1526	system->zss_processes = 0;
1527	system->zss_lwps = 0;
1528	system->zss_shm = 0;
1529	system->zss_shmids = 0;
1530	system->zss_semids = 0;
1531	system->zss_msgids = 0;
1532	system->zss_lofi = 0;
1533}
1534
1535
1536/* Reads each cpu's kstats, and adds the usage to the cpu's pset */
1537static void
1538zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
1539{
1540	zsd_system_t *sys;
1541	processorid_t cpuid;
1542	zsd_pset_t *pset_prev;
1543	zsd_pset_t *pset;
1544	kstat_t *kstat;
1545	kstat_named_t *knp;
1546	kid_t kid;
1547	uint64_t idle, intr, kern, user;
1548
1549	sys = ctl->zsctl_system;
1550	pset = cpu->zsc_pset;
1551	knp = NULL;
1552	kid = -1;
1553	cpuid = cpu->zsc_id;
1554
1555	/* Get the cpu time totals for this cpu */
1556	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
1557	if (kstat == NULL)
1558		return;
1559
1560	kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
1561	if (kid == -1)
1562		return;
1563
1564	knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
1565	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1566		return;
1567
1568	idle = knp->value.ui64;
1569
1570	knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
1571	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1572		return;
1573
1574	kern = knp->value.ui64;
1575
1576	knp = kstat_data_lookup(kstat, "cpu_nsec_user");
1577	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1578		return;
1579
1580	user = knp->value.ui64;
1581
1582	/*
1583	 * Tracking intr time per cpu just exists for future enhancements.
1584	 * The value is presently always zero.
1585	 */
1586	intr = 0;
1587	cpu->zsc_nsec_idle = idle;
1588	cpu->zsc_nsec_intr = intr;
1589	cpu->zsc_nsec_kern = kern;
1590	cpu->zsc_nsec_user = user;
1591
1592	if (cpu->zsc_onlined == B_TRUE) {
1593		/*
1594		 * cpu is newly online.  There is no reference value,
1595		 * so just record its current stats for comparison
1596		 * on next stat read.
1597		 */
1598		cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1599		cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1600		cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1601		cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1602		return;
1603	}
1604
1605	/*
1606	 * Calculate relative time since previous refresh.
	 * Paranoia.  Don't let time go backwards.
1608	 */
1609	idle = intr = kern = user = 0;
1610	if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
1611		idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;
1612
1613	if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
1614		intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;
1615
1616	if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
1617		kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;
1618
1619	if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
1620		user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;
1621
1622	/* Update totals for cpu usage */
1623	TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
1624	TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
1625	TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
1626	TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);
1627
	/*
	 * Add the cpu's stats to its pset if it is known to have been in
	 * the pset since the previous read.
	 */
1632	if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
1633	    cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
1634	    (pset_prev = zsd_lookup_pset_byid(ctl,
1635	    cpu->zsc_psetid_prev)) == NULL) {
1636		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
1637		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
1638		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
1639		TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
1640	} else {
1641		/*
1642		 * Last pset was different than current pset.
1643		 * Best guess is to split usage between the two.
1644		 */
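		/* Any odd nanosecond left over goes to the current pset. */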
1645		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
1646		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
1647		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
1648		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);
1649
1650		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
1651		    (idle / 2) + (idle % 2));
1652		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
1653		    (intr / 2) + (intr % 2));
1654		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
1655		    (kern / 2) + (kern % 2));
1656		TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
1657		    (user / 2) + (user % 2));
1658	}
1659	TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
1660	TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
1661	TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
1662	TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
1663}
1664
1665/* Determine the details of a processor set by pset_id */
1666static int
1667zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
1668    size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
1669    uint64_t *min, uint64_t *max, int64_t *importance)
1670{
1671	uint_t old, num;
1672
1673	pool_conf_t *conf = ctl->zsctl_pool_conf;
1674	pool_value_t **vals = ctl->zsctl_pool_vals;
1675	pool_resource_t **res_list = NULL;
1676	pool_resource_t *pset;
1677	pool_component_t **cpus = NULL;
1678	processorid_t *cache;
1679	const char *string;
1680	uint64_t uint64;
1681	int64_t int64;
1682	int i, ret, type;
1683
1684	if (ctl->zsctl_pool_status == POOL_DISABLED) {
1685
1686		/*
1687		 * Inspect legacy psets
1688		 */
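		/*
		 * pset_info() reports the actual number of cpus even when
		 * the supplied buffer is too small, so grow the cached cpu
		 * id array and retry until the result fits.
		 */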
1689		for (;;) {
1690			old = num = ctl->zsctl_cpu_ncache;
1691			ret = pset_info(psetid, &type, &num,
1692			    ctl->zsctl_cpu_cache);
1693			if (ret < 0) {
1694				/* pset is gone.  Tell caller to retry */
1695				errno = EINTR;
1696				return (-1);
1697			}
1698			if (num <= old) {
				/* Success */
1700				break;
1701			}
1702			if ((cache = (processorid_t *)realloc(
1703			    ctl->zsctl_cpu_cache, num *
1704			    sizeof (processorid_t))) != NULL) {
1705				ctl->zsctl_cpu_ncache = num;
1706				ctl->zsctl_cpu_cache = cache;
1707			} else {
1708				/*
1709				 * Could not allocate to get new cpu list.
1710				 */
1711				zsd_warn(gettext(
1712				    "Could not allocate for cpu list"));
1713				errno = ENOMEM;
1714				return (-1);
1715			}
1716		}
1717		/*
1718		 * Old school pset.  Just make min and max equal
1719		 * to its size
1720		 */
1721		if (psetid == ZS_PSET_DEFAULT) {
1722			*cputype = ZS_CPUTYPE_DEFAULT_PSET;
1723			(void) strlcpy(psetname, "pset_default", namelen);
1724		} else {
1725			*cputype = ZS_CPUTYPE_PSRSET_PSET;
1726			(void) snprintf(psetname, namelen,
1727			    "SUNWlegacy_pset_%d", psetid);
1728		}
1729
1730		/*
1731		 * Just treat legacy pset as a simple pool pset
1732		 */
1733		*online = num;
1734		*size = num;
1735		*min = num;
1736		*max = num;
1737		*importance = 1;
1738
1739		return (0);
1740	}
1741
1742	/* Look up the pool pset using the pset id */
1743	res_list = NULL;
1744	pool_value_set_int64(vals[1], psetid);
1745	if (pool_value_set_name(vals[1], "pset.sys_id")
1746	    != PO_SUCCESS)
1747		goto err;
1748
1749	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1750		goto err;
1751	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1752		goto err;
1753	if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
1754		goto err;
1755	if (num != 1)
1756		goto err;
1757	pset = res_list[0];
1758	free(res_list);
1759	res_list = NULL;
1760	if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
1761	    "pset.name", vals[0]) != POC_STRING ||
1762	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1763		goto err;
1764
1765	(void) strlcpy(psetname, string, namelen);
1766	if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
1767		*cputype = ZS_CPUTYPE_DEDICATED;
1768	else if (psetid == ZS_PSET_DEFAULT)
1769		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
1770	else
1771		*cputype = ZS_CPUTYPE_POOL_PSET;
1772
1773	/* Get size, min, max, and importance */
1774	if (pool_get_property(conf, pool_resource_to_elem(conf,
1775	    pset), "pset.size", vals[0]) == POC_UINT &&
1776	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1777		*size = uint64;
1778	else
1779		*size = 0;
1780
1782	if (pool_get_property(conf, pool_resource_to_elem(conf,
1783	    pset), "pset.min", vals[0]) == POC_UINT &&
1784	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1785		*min = uint64;
1786	else
1787		*min = 0;
1788	if (*min >= ZSD_PSET_UNLIMITED)
1789		*min = ZS_LIMIT_NONE;
1790
1791	if (pool_get_property(conf, pool_resource_to_elem(conf,
1792	    pset), "pset.max", vals[0]) == POC_UINT &&
1793	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1794		*max = uint64;
1795	else
1796		*max = ZS_LIMIT_NONE;
1797
1798	if (*max >= ZSD_PSET_UNLIMITED)
1799		*max = ZS_LIMIT_NONE;
1800
1801	if (pool_get_property(conf, pool_resource_to_elem(conf,
1802	    pset), "pset.importance", vals[0]) == POC_INT &&
1803	    pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
1804		*importance = int64;
1805	else
1806		*importance = (uint64_t)1;
1807
1808	*online = 0;
1809	if (*size == 0)
1810		return (0);
1811
1812	/* get cpus */
1813	cpus = pool_query_resource_components(conf, pset, &num, NULL);
1814	if (cpus == NULL)
1815		goto err;
1816
1817	/* Make sure there is space for cpu id list */
1818	if (num > ctl->zsctl_cpu_ncache) {
1819		if ((cache = (processorid_t *)realloc(
1820		    ctl->zsctl_cpu_cache, num *
1821		    sizeof (processorid_t))) != NULL) {
1822			ctl->zsctl_cpu_ncache = num;
1823			ctl->zsctl_cpu_cache = cache;
1824		} else {
1825			/*
1826			 * Could not allocate to get new cpu list.
1827			 */
1828			zsd_warn(gettext(
1829			    "Could not allocate for cpu list"));
1830			goto err;
1831		}
1832	}
1833
1834	/* count the online cpus */
1835	for (i = 0; i < num; i++) {
1836		if (pool_get_property(conf, pool_component_to_elem(
1837		    conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
1838		    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1839			goto err;
1840
1841		if (strcmp(string, "on-line") != 0 &&
1842		    strcmp(string, "no-intr") != 0)
1843			continue;
1844
1845		if (pool_get_property(conf, pool_component_to_elem(
1846		    conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
1847		    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
1848			goto err;
1849
1850		(*online)++;
1851		ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
1852	}
1853	free(cpus);
1854	return (0);
1855err:
1856	if (res_list != NULL)
1857		free(res_list);
1858	if (cpus != NULL)
1859		free(cpus);
1860
1861	/*
1862	 * The pools operations should succeed since the conf is a consistent
1863	 * snapshot.  Tell caller there is no need to retry.
1864	 */
1865	errno = EINVAL;
1866	return (-1);
1867}
1868
1869/*
1870 * Update the current list of processor sets.
1871 * This also updates the list of online cpus, and each cpu's pset membership.
1872 */
1873static void
1874zsd_refresh_psets(zsd_ctl_t *ctl)
1875{
1876	int i, j, ret, state;
1877	uint_t old, num;
1878	uint_t cputype;
1879	int64_t sys_id, importance;
1880	uint64_t online, size, min, max;
1881	zsd_system_t *system;
1882	zsd_pset_t *pset;
1883	zsd_cpu_t *cpu;
1884	psetid_t *cache;
1885	char psetname[ZS_PSETNAME_MAX];
1886	processorid_t cpuid;
1887	pool_value_t *pv_save = NULL;
1888	pool_resource_t **res_list = NULL;
1889	pool_resource_t *res;
1890	pool_value_t **vals;
1891	pool_conf_t *conf;
1892	boolean_t roll_cpus = B_TRUE;
1893
1894	/* Zero cpu counters to recount them */
1895	system = ctl->zsctl_system;
1896	system->zss_ncpus = 0;
1897	system->zss_ncpus_online = 0;
1898retry:
1899	ret = pool_get_status(&state);
1900	if (ret == 0 && state == POOL_ENABLED) {
1901
1902		conf = ctl->zsctl_pool_conf;
1903		vals = ctl->zsctl_pool_vals;
1904		pv_save = vals[1];
1905		vals[1] = NULL;
1906
1907		if (ctl->zsctl_pool_status == POOL_DISABLED) {
1908			if (pool_conf_open(ctl->zsctl_pool_conf,
1909			    pool_dynamic_location(), PO_RDONLY) == 0) {
1910				ctl->zsctl_pool_status = POOL_ENABLED;
1911				ctl->zsctl_pool_changed = POU_PSET;
1912			}
1913		} else {
1914			ctl->zsctl_pool_changed = 0;
1915			ret = pool_conf_update(ctl->zsctl_pool_conf,
1916			    &(ctl->zsctl_pool_changed));
1917			if (ret < 0) {
1918				/* Pools must have become disabled */
1919				(void) pool_conf_close(ctl->zsctl_pool_conf);
1920				ctl->zsctl_pool_status = POOL_DISABLED;
1921				if (pool_error() == POE_SYSTEM && errno ==
1922				    ENOTACTIVE)
1923					goto retry;
1924
1925				zsd_warn(gettext(
1926				    "Unable to update pool configuration"));
1927				/* Not able to get pool info.  Don't update. */
1928				goto err;
1929			}
1930		}
1931		/* Get the list of psets using libpool */
1932		if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1933			goto err;
1934
1935		if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1936			goto err;
1937		if ((res_list = pool_query_resources(conf, &num, vals))
1938		    == NULL)
1939			goto err;
1940
1941		if (num > ctl->zsctl_pset_ncache)  {
1942			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1943			    (num) * sizeof (psetid_t))) == NULL) {
1944				goto err;
1945			}
1946			ctl->zsctl_pset_ncache = num;
1947			ctl->zsctl_pset_cache = cache;
1948		}
1949		/* Save the pset id of each pset */
1950		for (i = 0; i < num; i++) {
1951			res = res_list[i];
1952			if (pool_get_property(conf, pool_resource_to_elem(conf,
1953			    res), "pset.sys_id", vals[0]) != POC_INT ||
1954			    pool_value_get_int64(vals[0], &sys_id)
1955			    != PO_SUCCESS)
1956				goto err;
1957			ctl->zsctl_pset_cache[i] = (int)sys_id;
1958		}
1959		vals[1] = pv_save;
1960		pv_save = NULL;
1961	} else {
1962		if (ctl->zsctl_pool_status == POOL_ENABLED) {
1963			(void) pool_conf_close(ctl->zsctl_pool_conf);
1964			ctl->zsctl_pool_status = POOL_DISABLED;
1965		}
1966		/* Get the pset list using legacy psets */
1967		for (;;) {
1968			old = num = ctl->zsctl_pset_ncache;
1969			(void) pset_list(ctl->zsctl_pset_cache, &num);
1970			if ((num + 1) <= old) {
1971				break;
1972			}
1973			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1974			    (num + 1) * sizeof (psetid_t))) != NULL) {
1975				ctl->zsctl_pset_ncache = num + 1;
1976				ctl->zsctl_pset_cache = cache;
1977			} else {
1978				/*
1979				 * Could not allocate to get new pset list.
1980				 * Give up
1981				 */
1982				return;
1983			}
1984		}
1985		/* Add default pset first; move old first entry to the end */
1986		ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
1987		ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
1988		num++;
1989	}
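	/*
	 * If a pset's configuration changes while it is being inspected,
	 * zsd_get_pool_pset() fails with EINTR and the pset walk restarts
	 * here.  The cpus are only marked to roll their usage data on the
	 * first pass.
	 */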
1990psets_changed:
1991	zsd_mark_cpus_start(ctl, roll_cpus);
1992	zsd_mark_psets_start(ctl);
1993	roll_cpus = B_FALSE;
1994
1995	/* Refresh cpu membership of all psets */
1996	for (i = 0; i < num; i++) {
1997
1998		/* Get pool pset information */
1999		sys_id = ctl->zsctl_pset_cache[i];
2000		if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
2001		    &cputype, &online, &size, &min, &max, &importance)
2002		    != 0) {
2003			if (errno == EINTR)
2004				goto psets_changed;
2005			zsd_warn(gettext("Failed to get info for pset %d"),
2006			    (int)sys_id);
2007			continue;
2008		}
2009
2010		system->zss_ncpus += size;
2011		system->zss_ncpus_online += online;
2012
2013		pset = zsd_lookup_insert_pset(ctl, psetname,
2014		    ctl->zsctl_pset_cache[i]);
2015
2016		/* update pset info */
2017		zsd_mark_pset_found(pset, cputype, online, size, min,
2018		    max, importance);
2019
2020		/* update each cpu in pset */
2021		for (j = 0; j < pset->zsp_online; j++) {
2022			cpuid = ctl->zsctl_cpu_cache[j];
2023			cpu = zsd_lookup_insert_cpu(ctl, cpuid);
2024			zsd_mark_cpu_found(cpu, pset, sys_id);
2025		}
2026	}
2027err:
2028	if (res_list != NULL)
2029		free(res_list);
2030	if (pv_save != NULL)
2031		vals[1] = pv_save;
2032}
2033
2034
2035
2036/*
2037 * Fetch the current pool and pset name for the given zone.
2038 */
2039static void
2040zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2041    char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2042{
2043	poolid_t poolid;
2044	pool_t **pools = NULL;
2045	pool_resource_t **res_list = NULL;
2046	char poolname[ZS_POOLNAME_MAX];
2047	char psetname[ZS_PSETNAME_MAX];
2048	pool_conf_t *conf = ctl->zsctl_pool_conf;
2049	pool_value_t *pv_save = NULL;
2050	pool_value_t **vals = ctl->zsctl_pool_vals;
2051	const char *string;
2052	int ret;
2053	int64_t int64;
2054	uint_t num;
2055
2056	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2057	    &poolid, sizeof (poolid));
2058	if (ret < 0)
2059		goto lookup_done;
2060
2061	pv_save = vals[1];
2062	vals[1] = NULL;
2063	pools = NULL;
2064	res_list = NULL;
2065
2066	/* Default values if lookup fails */
2067	(void) strlcpy(poolname, "pool_default", sizeof (poolname));
2068	(void) strlcpy(psetname, "pset_default", sizeof (psetname));
2069	*cputype = ZS_CPUTYPE_DEFAULT_PSET;
2070
2071	/* no dedicated cpu if pools are disabled */
2072	if (ctl->zsctl_pool_status == POOL_DISABLED)
2073		goto lookup_done;
2074
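	/*
	 * Walk from the zone's pool id to the pool, and from the pool to
	 * its pset resource, to recover both names.
	 */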
2075	/* Get the pool name using the id */
2076	pool_value_set_int64(vals[0], poolid);
2077	if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2078		goto lookup_done;
2079
2080	if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2081		goto lookup_done;
2082
2083	if (num != 1)
2084		goto lookup_done;
2085
2086	if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2087	    "pool.name", vals[0]) != POC_STRING ||
2088	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2089		goto lookup_done;
2090	(void) strlcpy(poolname, (char *)string, sizeof (poolname));
2091
2092	/* Get the name of the pset for the pool */
2093	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2094		goto lookup_done;
2095
2096	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2097		goto lookup_done;
2098
2099	if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2100	    == NULL)
2101		goto lookup_done;
2102
2103	if (num != 1)
2104		goto lookup_done;
2105
2106	if (pool_get_property(conf, pool_resource_to_elem(conf,
2107	    res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2108	    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2109		goto lookup_done;
2110
2111	if (int64 == ZS_PSET_DEFAULT)
2112		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
2113
2114	if (pool_get_property(conf, pool_resource_to_elem(conf,
2115	    res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2116	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2117		goto lookup_done;
2118
2119	(void) strlcpy(psetname, (char *)string, sizeof (psetname));
2120
2121	if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2122		*cputype = ZS_CPUTYPE_DEDICATED;
2123	else if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2124		*cputype = ZS_CPUTYPE_PSRSET_PSET;
2125	else if (int64 != ZS_PSET_DEFAULT)
2126		*cputype = ZS_CPUTYPE_POOL_PSET;
2127
2128lookup_done:
2129
2130	if (pv_save != NULL)
2131		vals[1] = pv_save;
2132
2133	if (res_list)
2134		free(res_list);
2135	if (pools)
2136		free(pools);
2137
2138	(void) strlcpy(pool, poolname, poollen);
2139	(void) strlcpy(pset, psetname, psetlen);
2140}
2141
2142/* Convert scheduler names to ZS_* scheduler flags */
2143static uint_t
2144zsd_schedname2int(char *clname, int pri)
2145{
2146	uint_t sched = 0;
2147
2148	if (strcmp(clname, "TS") == 0) {
2149		sched = ZS_SCHED_TS;
2150	} else if (strcmp(clname, "IA") == 0) {
2151		sched = ZS_SCHED_IA;
2152	} else if (strcmp(clname, "FX") == 0) {
2153		if (pri > 59) {
2154			sched = ZS_SCHED_FX_60;
2155		} else {
2156			sched = ZS_SCHED_FX;
2157		}
2158	} else if (strcmp(clname, "RT") == 0) {
2159		sched = ZS_SCHED_RT;
2160
2161	} else if (strcmp(clname, "FSS") == 0) {
2162		sched = ZS_SCHED_FSS;
2163	}
2164	return (sched);
2165}
2166
2167static uint64_t
2168zsd_get_zone_rctl_limit(char *name)
2169{
2170	rctlblk_t *rblk;
2171
2172	rblk = (rctlblk_t *)alloca(rctlblk_size());
2173	if (getrctl(name, NULL, rblk, RCTL_FIRST)
2174	    != 0) {
2175		return (ZS_LIMIT_NONE);
2176	}
2177	return (rctlblk_get_value(rblk));
2178}
2179
2180static uint64_t
2181zsd_get_zone_rctl_usage(char *name)
2182{
2183	rctlblk_t *rblk;
2184
2185	rblk = (rctlblk_t *)alloca(rctlblk_size());
2186	if (getrctl(name, NULL, rblk, RCTL_USAGE)
2187	    != 0) {
2188		return (0);
2189	}
2190	return (rctlblk_get_value(rblk));
2191}
2192
2193#define	ZSD_NUM_RCTL_VALS 19
2194
2195/*
2196 * Fetch the limit information for a zone.  This uses zone_enter() as the
2197 * getrctl(2) system call only returns rctl information for the zone of
2198 * the caller.
2199 */
2200static int
2201zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
2202    uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
2203    uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
2204    uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
2205    uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
2206    uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
2207    uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
2208{
2209	int p[2], pid, tmpl_fd, ret;
2210	ctid_t ct;
2211	char class[PC_CLNMSZ];
2212	uint64_t vals[ZSD_NUM_RCTL_VALS];
2213	zsd_system_t *sys = ctl->zsctl_system;
2214	int i = 0;
2215	int res = 0;
2216
2217	/* Treat all caps as no cap on error */
2218	*cpu_shares = ZS_LIMIT_NONE;
2219	*cpu_cap = ZS_LIMIT_NONE;
2220	*ram_cap = ZS_LIMIT_NONE;
2221	*locked_cap = ZS_LIMIT_NONE;
2222	*vm_cap = ZS_LIMIT_NONE;
2223
2224	*processes_cap = ZS_LIMIT_NONE;
2225	*lwps_cap = ZS_LIMIT_NONE;
2226	*shm_cap = ZS_LIMIT_NONE;
2227	*shmids_cap = ZS_LIMIT_NONE;
2228	*semids_cap = ZS_LIMIT_NONE;
2229	*msgids_cap = ZS_LIMIT_NONE;
2230	*lofi_cap = ZS_LIMIT_NONE;
2231
2232	*processes = 0;
2233	*lwps = 0;
2234	*shm = 0;
2235	*shmids = 0;
2236	*semids = 0;
2237	*msgids = 0;
2238	*lofi = 0;
2239
2240	/* Get the ram cap first since it is a zone attr */
2241	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
2242	    ram_cap, sizeof (*ram_cap));
2243	if (ret < 0 || *ram_cap == 0)
2244		*ram_cap = ZS_LIMIT_NONE;
2245
2246	/* Get the zone's default scheduling class */
2247	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
2248	    class, sizeof (class));
2249	if (ret < 0)
2250		return (-1);
2251
2252	*sched = zsd_schedname2int(class, 0);
2253
2254	/* rctl caps must be fetched from within the zone */
2255	if (pipe(p) != 0)
2256		return (-1);
2257
2258	if ((tmpl_fd = init_template()) == -1) {
2259		(void) close(p[0]);
2260		(void) close(p[1]);
2261		return (-1);
2262	}
2263	pid = forkx(0);
2264	if (pid < 0) {
2265		(void) ct_tmpl_clear(tmpl_fd);
2266		(void) close(p[0]);
2267		(void) close(p[1]);
2268		return (-1);
2269	}
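	/*
	 * Child: enter the target zone, read the rctl values, and write
	 * them back over the pipe.  Parent: wait for the child and read
	 * the values below.
	 */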
2270	if (pid == 0) {
2271
2272		(void) ct_tmpl_clear(tmpl_fd);
2273		(void) close(tmpl_fd);
2274		(void) close(p[0]);
2275		if (zone->zsz_id != getzoneid()) {
2276			if (zone_enter(zone->zsz_id) < 0) {
2277				(void) close(p[1]);
2278				_exit(0);
2279			}
2280		}
2281
2282		/* Get caps for zone, and write them to zonestatd parent. */
2283		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
2284		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
2285		vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
2286		vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
2287		vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
2288		vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
2289		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
2290		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
2291		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
2292		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
2293		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
2294		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
2295		vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
2296		vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
2297		vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
2298		vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
2299		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
2300		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");
2301
2302		if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2303		    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2304			(void) close(p[1]);
2305			_exit(1);
2306		}
2307
2308		(void) close(p[1]);
2309		_exit(0);
2310	}
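	/*
	 * Fetch the id of the contract created for the child so it can be
	 * abandoned during cleanup.
	 */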
2311	if (contract_latest(&ct) == -1)
2312		ct = -1;
2313
2314	(void) ct_tmpl_clear(tmpl_fd);
2315	(void) close(tmpl_fd);
2316	(void) close(p[1]);
2317	while (waitpid(pid, NULL, 0) != pid)
2318		;
2319
2320	/* Read cap from child in zone */
2321	if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2322	    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2323		res = -1;
2324		goto cleanup;
2325	}
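	/* Unpack the values in the same order the child wrote them */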
2326	i = 0;
2327	*cpu_shares = vals[i++];
2328	*cpu_cap = vals[i++];
2329	*locked_cap = vals[i++];
2330	*vm_cap = vals[i++];
2331	*processes_cap = vals[i++];
2332	*processes = vals[i++];
2333	*lwps_cap = vals[i++];
2334	*lwps = vals[i++];
2335	*shm_cap = vals[i++];
2336	*shm = vals[i++];
2337	*shmids_cap = vals[i++];
2338	*shmids = vals[i++];
2339	*semids_cap = vals[i++];
2340	*semids = vals[i++];
2341	*msgids_cap = vals[i++];
2342	*msgids = vals[i++];
2343	*lofi_cap = vals[i++];
2344	*lofi = vals[i++];
2345
2346	/* Interpret maximum values as no cap */
2347	if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
2348		*cpu_cap = ZS_LIMIT_NONE;
2349	if (*processes_cap == sys->zss_processes_max)
2350		*processes_cap = ZS_LIMIT_NONE;
2351	if (*lwps_cap == sys->zss_lwps_max)
2352		*lwps_cap = ZS_LIMIT_NONE;
2353	if (*shm_cap == sys->zss_shm_max)
2354		*shm_cap = ZS_LIMIT_NONE;
2355	if (*shmids_cap == sys->zss_shmids_max)
2356		*shmids_cap = ZS_LIMIT_NONE;
2357	if (*semids_cap == sys->zss_semids_max)
2358		*semids_cap = ZS_LIMIT_NONE;
2359	if (*msgids_cap == sys->zss_msgids_max)
2360		*msgids_cap = ZS_LIMIT_NONE;
2361	if (*lofi_cap == sys->zss_lofi_max)
2362		*lofi_cap = ZS_LIMIT_NONE;
2363
2364
2365cleanup:
2366	(void) close(p[0]);
2367	/* tmpl_fd was already cleared and closed after forkx() */
2368	if (ct != -1)
2369		(void) contract_abandon_id(ct);
2370
2371	return (res);
2372}
2373
2374/* Update the current list of running zones */
2375static void
2376zsd_refresh_zones(zsd_ctl_t *ctl)
2377{
2378	zsd_zone_t *zone;
2379	uint_t old, num;
2380	ushort_t flags;
2381	int i, ret;
2382	zoneid_t *cache;
2383	uint64_t cpu_shares;
2384	uint64_t cpu_cap;
2385	uint64_t ram_cap;
2386	uint64_t locked_cap;
2387	uint64_t vm_cap;
2388	uint64_t processes_cap;
2389	uint64_t processes;
2390	uint64_t lwps_cap;
2391	uint64_t lwps;
2392	uint64_t shm_cap;
2393	uint64_t shm;
2394	uint64_t shmids_cap;
2395	uint64_t shmids;
2396	uint64_t semids_cap;
2397	uint64_t semids;
2398	uint64_t msgids_cap;
2399	uint64_t msgids;
2400	uint64_t lofi_cap;
2401	uint64_t lofi;
2402
2403	char zonename[ZS_ZONENAME_MAX];
2404	char poolname[ZS_POOLNAME_MAX];
2405	char psetname[ZS_PSETNAME_MAX];
2406	uint_t sched;
2407	uint_t cputype;
2408	uint_t iptype;
2409
2410	/* Get the current list of running zones */
2411	for (;;) {
2412		old = num = ctl->zsctl_zone_ncache;
2413		(void) zone_list(ctl->zsctl_zone_cache, &num);
2414		if (num <= old)
2415			break;
2416		if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
2417		    (num) * sizeof (zoneid_t))) != NULL) {
2418			ctl->zsctl_zone_ncache = num;
2419			ctl->zsctl_zone_cache = cache;
2420		} else {
2421			/* Could not allocate to get new zone list.  Give up */
2422			return;
2423		}
2424	}
2425
2426	zsd_mark_zones_start(ctl);
2427
2428	for (i = 0; i < num; i++) {
2429
2430		ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2431		    zonename, sizeof (zonename));
2432		if (ret < 0)
2433			continue;
2434
2435		zone = zsd_lookup_insert_zone(ctl, zonename,
2436		    ctl->zsctl_zone_cache[i]);
2437
2438		ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2439		    &flags, sizeof (flags));
2440		if (ret < 0)
2441			continue;
2442
2443		if (flags & ZF_NET_EXCL)
2444			iptype = ZS_IPTYPE_EXCLUSIVE;
2445		else
2446			iptype = ZS_IPTYPE_SHARED;
2447
2448		zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2449		    psetname, sizeof (psetname), &cputype);
2450
2451		if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2452		    &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2453		    &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2454		    &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2455		    &lofi, &sched) != 0)
2456			continue;
2457
2458		zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2459		    locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2460		    lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2461		    semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2462		    psetname, sched, cputype, iptype);
2463	}
2464}
2465
2466/* Fetch the details of a process from its psinfo_t */
2467static void
2468zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2469    psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2470    timestruc_t *delta, uint_t *sched)
2471{
2472	timestruc_t d;
2473	zsd_proc_t *proc;
2474
2475	/* Get cached data for proc */
2476	proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2477	*psetid = psinfo->pr_lwp.pr_bindpset;
2478
2479	if (proc->zspr_psetid == ZS_PSET_ERROR)
2480		*prev_psetid = *psetid;
2481	else
2482		*prev_psetid = proc->zspr_psetid;
2483
2484	*zoneid = psinfo->pr_zoneid;
2485	if (proc->zspr_zoneid == -1)
2486		*prev_zoneid = *zoneid;
2487	else
2488		*prev_zoneid = proc->zspr_zoneid;
2489
2490	TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2491	*delta = d;
2492
2493	*sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2494	    psinfo->pr_lwp.pr_pri);
2495
2496	/* Update cached data for proc */
2497	proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2498	proc->zspr_zoneid = psinfo->pr_zoneid;
2499	proc->zspr_sched = *sched;
2500	proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2501	proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2502	proc->zspr_ppid = psinfo->pr_ppid;
2503}
2504
2505/*
2506 * Reset the known cpu usage of a process. This is done after a process
2507 * exits so that if the pid is recycled, data from its previous life is
2508 * not reused
2509 */
2510static void
2511zsd_flush_proc_info(zsd_proc_t *proc)
2512{
2513	proc->zspr_usage.tv_sec = 0;
2514	proc->zspr_usage.tv_nsec = 0;
2515}
2516
2517/*
2518 * Open the current extended accounting file.  On initialization, open the
2519 * file as the current file to be used.  Otherwise, open the file as the
2520 * next file to use once the current file reaches EOF.
2521 */
2522static int
2523zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2524{
2525	int ret, oret, state, trys = 0, flags;
2526	int *fd, *open;
2527	ea_file_t *eaf;
2528	struct stat64 *stat;
2529	char path[MAXPATHLEN];
2530
2531	/*
2532	 * The accounting file is first opened at the tail.  Following
2533	 * opens to new accounting files are opened at the head.
2534	 */
2535	if (init == B_TRUE) {
2536		flags = EO_NO_VALID_HDR | EO_TAIL;
2537		fd = &ctl->zsctl_proc_fd;
2538		eaf = &ctl->zsctl_proc_eaf;
2539		stat = &ctl->zsctl_proc_stat;
2540		open = &ctl->zsctl_proc_open;
2541	} else {
2542		flags = EO_NO_VALID_HDR | EO_HEAD;
2543		fd = &ctl->zsctl_proc_fd_next;
2544		eaf = &ctl->zsctl_proc_eaf_next;
2545		stat = &ctl->zsctl_proc_stat_next;
2546		open = &ctl->zsctl_proc_open_next;
2547	}
2548
2549	*fd = -1;
2550	*open = 0;
2551retry:
2552	/* open accounting files for cpu consumption */
2553	ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2554	if (ret != 0) {
2555		zsd_warn(gettext("Unable to get process accounting state"));
2556		goto err;
2557	}
2558	if (state != AC_ON) {
2559		if (trys > 0) {
2560			zsd_warn(gettext(
2561			    "Unable to enable process accounting"));
2562			goto err;
2563		}
2564		(void) zsd_enable_cpu_stats();
2565		trys++;
2566		goto retry;
2567	}
2568
2569	ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2570	if (ret != 0) {
2571		zsd_warn(gettext("Unable to get process accounting file"));
2572		goto err;
2573	}
2574
2575	if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
2576	    (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2577		ret = fstat64(*fd, stat);
2578
2579	if (*fd < 0 || oret < 0 || ret < 0) {
2580		struct timespec ts;
2581
2582		/*
2583		 * It is possible the accounting file is momentarily unavailable
2584		 * because it is being rolled.  Try for up to half a second.
2585		 *
2586		 * If failure to open accounting file persists, give up.
2587		 */
2588		if (*fd >= 0 && oret == 0)
2589			(void) ea_close(eaf);
2590		else if (*fd >= 0)
2591			(void) close(*fd);
2592		if (trys > 500) {
2593			zsd_warn(gettext(
2594			    "Unable to open process accounting file"));
2595			goto err;
2596		}
2597		/* wait one millisecond */
2598		ts.tv_sec = 0;
2599		ts.tv_nsec = NANOSEC / 1000;
2600		(void) nanosleep(&ts, NULL);
2601		goto retry;
2602	}
2603	*open = 1;
2604	return (0);
2605err:
2606	if (*fd >= 0)
2607		(void) close(*fd);
2608	*open = 0;
2609	*fd = -1;
2610	return (-1);
2611}
2612
2613/*
2614 * Walk /proc and charge each process to its zone and processor set.
2615 * Then read exacct data for exited processes, and charge them as well.
2616 */
2617static void
2618zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2619{
2620	DIR *dir;
2621	struct dirent *dent;
2622	psinfo_t psinfo;
2623	int fd, ret;
2624	zsd_proc_t *proc, *pproc, *tmp, *next;
2625	list_t pplist, plist;
2626	zsd_zone_t *zone, *prev_zone;
2627	zsd_pset_t *pset, *prev_pset;
2628	psetid_t psetid, prev_psetid;
2629	zoneid_t zoneid, prev_zoneid;
2630	zsd_pset_usage_t *usage, *prev_usage;
2631	char path[MAXPATHLEN];
2632
2633	ea_object_t object;
2634	ea_object_t pobject;
2635	boolean_t hrtime_expired = B_FALSE;
2636	struct timeval interval_end;
2637
2638	timestruc_t delta, d1, d2;
2639	uint_t sched = 0;
2640
2641	/*
2642	 * Get the current accounting file.  The current accounting file
2643	 * may be different than the file in use, as the accounting file
2644	 * may have been rolled, or manually changed by an admin.
2645	 */
2646	ret = zsd_open_exacct(ctl, init);
2647	if (ret != 0) {
2648		zsd_warn(gettext("Unable to track process accounting"));
2649		return;
2650	}
2651
2652	/*
2653	 * Mark the current time as the interval end time.  Don't track
2654	 * processes that exit after this time.
2655	 */
2656	(void) gettimeofday(&interval_end, NULL);
2657
2658	dir = opendir("/proc");
2659	if (dir == NULL) {
2660		zsd_warn(gettext("Unable to open /proc"));
2661		return;
2662	}
2663
2664	dent = ctl->zsctl_procfs_dent;
2665
2666	(void) memset(dent, 0, ctl->zsctl_procfs_dent_size);
2667
2668	/* Walk all processes and compute each zone's usage on each pset. */
2669	while (readdir_r(dir, dent) != 0) {
2670
2671		if (strcmp(dent->d_name, ".") == 0 ||
2672		    strcmp(dent->d_name, "..") == 0)
2673			continue;
2674
2675		(void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2676		    dent->d_name);
2677
2678		fd = open(path, O_RDONLY);
2679		if (fd < 0)
2680			continue;
2681
2682		if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2683			(void) close(fd);
2684			continue;
2685		}
2686		(void) close(fd);
2687
2688		zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2689		    &zoneid, &prev_zoneid, &delta, &sched);
2690
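		/*
		 * If the process changed zone or pset since the last scan,
		 * split its cpu delta evenly between the old and the new
		 * usage (d1 and d2 below).
		 */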
2691		d1.tv_sec = delta.tv_sec / 2;
2692		d1.tv_nsec = delta.tv_nsec / 2;
2693		d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2694		d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2695
2696		/* Get the zone and pset this process is running in */
2697		zone = zsd_lookup_zone_byid(ctl, zoneid);
2698		if (zone == NULL)
2699			continue;
2700		pset = zsd_lookup_pset_byid(ctl, psetid);
2701		if (pset == NULL)
2702			continue;
2703		usage = zsd_lookup_insert_usage(ctl, pset, zone);
2704		if (usage == NULL)
2705			continue;
2706
2707		/*
2708		 * Get the usage of the previous zone and pset if they were
2709		 * different.
2710		 */
2711		if (zoneid != prev_zoneid)
2712			prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2713		else
2714			prev_zone = NULL;
2715
2716		if (psetid != prev_psetid)
2717			prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2718		else
2719			prev_pset = NULL;
2720
2721		prev_usage = NULL;
2722		if (prev_zone != NULL || prev_pset != NULL) {
2723			if (prev_zone == NULL)
2724				prev_zone = zone;
2725			if (prev_pset == NULL)
2726				prev_pset = pset;
2727
2728			prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2729			    prev_zone);
2730		}
2731
2732		/* Update the usage with the process's info */
2733		zsd_mark_pset_usage_found(usage, sched);
2734		if (prev_usage != NULL)
2735			zsd_mark_pset_usage_found(prev_usage, sched);
2739
2740		/*
2741		 * First time around is just to get a starting point.  All
2742		 * usages will be zero.
2743		 */
2744		if (init == B_TRUE)
2745			continue;
2746
2747		if (prev_usage == NULL) {
2748			zsd_add_usage(ctl, usage, &delta);
2749		} else {
2750			zsd_add_usage(ctl, usage, &d1);
2751			zsd_add_usage(ctl, prev_usage, &d2);
2752		}
2753	}
2754	(void) closedir(dir);
2755
2756	/*
2757	 * No need to collect exited proc data on initialization.  Just
2758	 * caching the usage of the known processes to get a zero starting
2759	 * point.
2760	 */
2761	if (init == B_TRUE)
2762		return;
2763
2764	/*
2765	 * Add accounting records to account for processes which have
2766	 * exited.
2767	 */
2768	list_create(&plist, sizeof (zsd_proc_t),
2769	    offsetof(zsd_proc_t, zspr_next));
2770	list_create(&pplist, sizeof (zsd_proc_t),
2771	    offsetof(zsd_proc_t, zspr_next));
2772
2773	for (;;) {
2774		pid_t pid;
2775		pid_t ppid;
2776		timestruc_t user, sys, proc_usage;
2777		timestruc_t finish;
2778		int numfound = 0;
2779
2780		bzero(&object, sizeof (object));
2781		proc = NULL;
2782		zone = NULL;
2783		pset = NULL;
2784		usage = NULL;
2785		ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2786		if (ret == EO_ERROR) {
2787			if (ea_error() == EXR_EOF) {
2788
2789				struct stat64 *stat;
2790				struct stat64 *stat_next;
2791
2792				/*
2793				 * See if the next accounting file is the
2794				 * same as the current accounting file.
2795				 */
2796				stat = &(ctl->zsctl_proc_stat);
2797				stat_next = &(ctl->zsctl_proc_stat_next);
2798				if (stat->st_ino == stat_next->st_ino &&
2799				    stat->st_dev == stat_next->st_dev) {
2800					/*
2801					 * End of current accounting file is
2802					 * reached, so finished.  Clear EOF
2803					 * bit for next time around.
2804					 */
2805					ea_clear(&ctl->zsctl_proc_eaf);
2806					break;
2807				} else {
2808					/*
2809					 * Accounting file has changed.  Move
2810					 * to current accounting file.
2811					 */
2812					(void) ea_close(&ctl->zsctl_proc_eaf);
2813
2814					ctl->zsctl_proc_fd =
2815					    ctl->zsctl_proc_fd_next;
2816					ctl->zsctl_proc_eaf =
2817					    ctl->zsctl_proc_eaf_next;
2818					ctl->zsctl_proc_stat =
2819					    ctl->zsctl_proc_stat_next;
2820
2821					ctl->zsctl_proc_fd_next = -1;
2822					ctl->zsctl_proc_open_next = 0;
2823					continue;
2824				}
2825			} else {
2826				/*
2827				 * Other accounting error.  Give up on
2828				 * accounting.
2829				 */
2830				goto ea_err;
2831			}
2832		}
2833		/* Skip if not a process group */
2834		if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2835		    (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2836			(void) ea_free_item(&object, EUP_ALLOC);
2837			continue;
2838		}
2839
2840		/* The process group entry should be complete */
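		/*
		 * Nine items are expected per group: pid, ancestor pid,
		 * zone name, user and system cpu time (seconds and
		 * nanoseconds each), and finish time (seconds and
		 * nanoseconds).
		 */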
2841		while (numfound < 9) {
2842			bzero(&pobject, sizeof (pobject));
2843			ret = ea_get_object(&ctl->zsctl_proc_eaf,
2844			    &pobject);
2845			if (ret < 0) {
2846				(void) ea_free_item(&object, EUP_ALLOC);
2847				zsd_warn(
2848				    "unable to get process accounting data");
2849				goto ea_err;
2850			}
2851			/* Next entries should be process data */
2852			if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2853			    EXT_GROUP) {
2854				(void) ea_free_item(&object, EUP_ALLOC);
2855				(void) ea_free_item(&pobject, EUP_ALLOC);
2856				zsd_warn(
2857				    "process data of wrong type");
2858				goto ea_err;
2859			}
2860			switch (pobject.eo_catalog & EXD_DATA_MASK) {
2861			case EXD_PROC_PID:
2862				pid = pobject.eo_item.ei_uint32;
2863				proc = &(ctl->zsctl_proc_array[pid]);
2864				/*
2865				 * This process should not be currently in
2866				 * the list of processes to process.
2867				 */
2868				assert(!list_link_active(&proc->zspr_next));
2869				numfound++;
2870				break;
2871			case EXD_PROC_ANCPID:
2872				ppid = pobject.eo_item.ei_uint32;
2873				pproc = &(ctl->zsctl_proc_array[ppid]);
2874				numfound++;
2875				break;
2876			case EXD_PROC_ZONENAME:
2877				zone = zsd_lookup_zone(ctl,
2878				    pobject.eo_item.ei_string, -1);
2879				numfound++;
2880				break;
2881			case EXD_PROC_CPU_USER_SEC:
2882				user.tv_sec =
2883				    pobject.eo_item.ei_uint64;
2884				numfound++;
2885				break;
2886			case EXD_PROC_CPU_USER_NSEC:
2887				user.tv_nsec =
2888				    pobject.eo_item.ei_uint64;
2889				numfound++;
2890				break;
2891			case EXD_PROC_CPU_SYS_SEC:
2892				sys.tv_sec =
2893				    pobject.eo_item.ei_uint64;
2894				numfound++;
2895				break;
2896			case EXD_PROC_CPU_SYS_NSEC:
2897				sys.tv_nsec =
2898				    pobject.eo_item.ei_uint64;
2899				numfound++;
2900				break;
2901			case EXD_PROC_FINISH_SEC:
2902				finish.tv_sec =
2903				    pobject.eo_item.ei_uint64;
2904				numfound++;
2905				break;
2906			case EXD_PROC_FINISH_NSEC:
2907				finish.tv_nsec =
2908				    pobject.eo_item.ei_uint64;
2909				numfound++;
2910				break;
2911			}
2912			(void) ea_free_item(&pobject, EUP_ALLOC);
2913		}
2914		(void) ea_free_item(&object, EUP_ALLOC);
2915		if (numfound != 9) {
2916			zsd_warn(gettext(
2917			    "Malformed process accounting entry found"));
2918			goto proc_done;
2919		}
2920
2921		if (finish.tv_sec > interval_end.tv_sec ||
2922		    (finish.tv_sec == interval_end.tv_sec &&
2923		    finish.tv_nsec > (interval_end.tv_usec * 1000)))
2924			hrtime_expired = B_TRUE;
2925
2926		/*
2927		 * Try to identify the zone and pset to which this
2928		 * exited process belongs.
2929		 */
2930		if (zone == NULL)
2931			goto proc_done;
2932
2933		/* Save proc info */
2934		proc->zspr_ppid = ppid;
2935		proc->zspr_zoneid = zone->zsz_id;
2936
2937		prev_psetid = ZS_PSET_ERROR;
2938		sched = 0;
2939
2940		/*
2941		 * The following tries to deduce the process's pset.
2942		 *
2943		 * First choose pset and sched using cached value from the
2944		 * most recent time the process has been seen.
2945		 *
2946		 * pset and sched can change across zone_enter, so make sure
2947		 * most recent sighting of this process was in the same
2948		 * zone before using most recent known value.
2949		 *
2950		 * If there is no known value, use the value of the process's
2951		 * parent.  If parent is unknown, walk parents until a known
2952		 * parent is found.
2953		 *
2954		 * If no parent in the zone is found, use the zone's default
2955		 * pset and scheduling class.
2956		 */
2957		if (proc->zspr_psetid != ZS_PSET_ERROR) {
2958			prev_psetid = proc->zspr_psetid;
2959			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2960			sched = proc->zspr_sched;
2961		} else if (pproc->zspr_zoneid == zone->zsz_id &&
2962		    pproc->zspr_psetid != ZS_PSET_ERROR) {
2963			prev_psetid = pproc->zspr_psetid;
2964			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2965			sched = pproc->zspr_sched;
2966		}
2967
2968		if (pset == NULL) {
2969			/*
2970			 * Process or processes parent has never been seen.
2971			 * Save to deduce a known parent later.
2972			 */
2973			proc_usage = sys;
2974			TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2975			TIMESTRUC_DELTA(delta, proc_usage,
2976			    proc->zspr_usage);
2977			proc->zspr_usage = delta;
2978			list_insert_tail(&plist, proc);
2979			continue;
2980		}
2981
2982		/* Add the zone's usage to the pset */
2983		usage = zsd_lookup_insert_usage(ctl, pset, zone);
2984		if (usage == NULL)
2985			goto proc_done;
2986
2987		zsd_mark_pset_usage_found(usage, sched);
2988
2989		/* compute the usage to add for the exited proc */
2990		proc_usage = sys;
2991		TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2992		TIMESTRUC_DELTA(delta, proc_usage,
2993		    proc->zspr_usage);
2994
2995		zsd_add_usage(ctl, usage, &delta);
2996proc_done:
2997		zsd_flush_proc_info(proc);
2998
2999		if (hrtime_expired == B_TRUE)
3000			break;
3001	}
3002	/*
3003	 * close next accounting file.
3004	 */
3005	if (ctl->zsctl_proc_open_next) {
3006		(void) ea_close(
3007		    &ctl->zsctl_proc_eaf_next);
3008		ctl->zsctl_proc_open_next = 0;
3009		ctl->zsctl_proc_fd_next = -1;
3010	}
3011
3012	/* For the remaining processes, use pset and sched of a known parent */
3013	proc = list_head(&plist);
3014	while (proc != NULL) {
3015		next = proc;
3016		for (;;) {
3017			if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3018				/*
3019				 * Kernel process, or parent is unknown, skip
3020				 * process, remove from process list.
3021				 */
3022				tmp = proc;
3023				proc = list_next(&plist, proc);
3024				list_link_init(&tmp->zspr_next);
3025				break;
3026			}
3027			pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3028			if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3029				/*
3030				 * Parent in different zone.  Save process and
3031				 * use zone's default pset and sched below
3032				 */
3033				tmp = proc;
3034				proc = list_next(&plist, proc);
3035				list_remove(&plist, tmp);
3036				list_insert_tail(&pplist, tmp);
3037				break;
3038			}
3039			/* Parent has unknown pset; search parent's parent */
3040			if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3041				next = pproc;
3042				continue;
3043			}
3044			/* Found parent with known pset.  Use its info */
3045			proc->zspr_psetid = pproc->zspr_psetid;
3046			proc->zspr_sched = pproc->zspr_sched;
3047			next->zspr_psetid = pproc->zspr_psetid;
3048			next->zspr_sched = pproc->zspr_sched;
3049			zone = zsd_lookup_zone_byid(ctl,
3050			    proc->zspr_zoneid);
3051			if (zone == NULL) {
3052				tmp = proc;
3053				proc = list_next(&plist, proc);
3054				list_remove(&plist, tmp);
3055				list_link_init(&tmp->zspr_next);
3056				break;
3057			}
3058			pset = zsd_lookup_pset_byid(ctl,
3059			    proc->zspr_psetid);
3060			if (pset == NULL) {
3061				tmp = proc;
3062				proc = list_next(&plist, proc);
3063				list_remove(&plist, tmp);
3064				list_link_init(&tmp->zspr_next);
3065				break;
3066			}
3067			/* Add the zone's usage to the pset */
3068			usage = zsd_lookup_insert_usage(ctl, pset, zone);
3069			if (usage == NULL) {
3070				tmp = proc;
3071				proc = list_next(&plist, proc);
3072				list_remove(&plist, tmp);
3073				list_link_init(&tmp->zspr_next);
3074				break;
3075			}
3076			zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3077			zsd_add_usage(ctl, usage, &proc->zspr_usage);
3078			zsd_flush_proc_info(proc);
3079			tmp = proc;
3080			proc = list_next(&plist, proc);
3081			list_remove(&plist, tmp);
3082			list_link_init(&tmp->zspr_next);
3083			break;
3084		}
3085	}
3086	/*
3087	 * For processes whose pset could not be deduced from a parent,
3088	 * use the zone's default pset and scheduling class.
3089	 */
3090	proc = list_head(&pplist);
3091	while (proc != NULL) {
3092
3093		zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3094		if (zone == NULL)
3095			goto next;
3096		if (zone->zsz_psetid != ZS_PSET_ERROR &&
3097		    zone->zsz_psetid != ZS_PSET_MULTI) {
3098			prev_psetid = zone->zsz_psetid;
3099			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3100		} else {
3101			pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3102			if (pset != NULL)
3103				prev_psetid = pset->zsp_id;
3104		}
3105		if (pset == NULL)
3106			goto next;
3107
3108		sched = zone->zsz_scheds;
3109		/*
3110		 * Ignore FX high scheduling class if it is not the
3111		 * only scheduling class in the zone.
3112		 */
3113		if (sched != ZS_SCHED_FX_60)
3114			sched &= (~ZS_SCHED_FX_60);
3115		/*
3116		 * If more than one scheduling class has been found
3117		 * in the zone, use zone's default scheduling class for
3118		 * this process.
3119		 */
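		/* (sched & (sched - 1)) is nonzero iff more than one bit is set */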
3120		if ((sched & (sched - 1)) != 0)
3121			sched = zone->zsz_default_sched;
3122
3123		/* Add the zone's usage to the pset */
3124		usage = zsd_lookup_insert_usage(ctl, pset, zone);
3125		if (usage == NULL)
3126			goto next;
3127
3128		zsd_mark_pset_usage_found(usage, sched);
3129		zsd_add_usage(ctl, usage, &proc->zspr_usage);
3130next:
3131		tmp = proc;
3132		proc = list_next(&pplist, proc);
3133		zsd_flush_proc_info(tmp);
3134		list_link_init(&tmp->zspr_next);
3135	}
3136	return;
3137ea_err:
3138	/*
3139	 * Close the next accounting file if we have not transitioned to it
3140	 * yet.
3141	 */
3142	if (ctl->zsctl_proc_open_next) {
3143		(void) ea_close(&ctl->zsctl_proc_eaf_next);
3144		ctl->zsctl_proc_open_next = 0;
3145		ctl->zsctl_proc_fd_next = -1;
3146	}
3147}
3148
3149/*
3150 * getvmusage(2) uses size_t's in the passed data structure, which differ
3151 * in size for 32-bit and 64-bit kernels.  Since this is a contracted interface,
3152 * and zonestatd does not necessarily match the kernel's bitness, marshal
3153 * results appropriately.
3154 */
3155static int
3156zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3157    uint64_t *nres)
3158{
3159	zsd_vmusage32_t *vmu32;
3160	zsd_vmusage64_t *vmu64;
3161	uint32_t nres32;
3162	int i;
3163	int ret;
3164
3165	if (ctl->zsctl_kern_bits == 32)  {
3166		nres32 = *nres;
3167		ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3168		    flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3169		*nres = nres32;
3170		if (ret == 0 && buf != NULL) {
3171			/*
3172			 * An array of vmusage32_t's has been returned.
3173			 * Convert it to an array of vmusage64_t's.
3174			 */
3175			vmu32 = (zsd_vmusage32_t *)buf;
3176			vmu64 = (zsd_vmusage64_t *)buf;
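			/*
			 * Convert in place, walking backwards so the larger
			 * 64-bit entries do not overwrite 32-bit entries
			 * that have not been converted yet.
			 */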
3177			for (i = nres32 - 1; i >= 0; i--) {
3178
3179				vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3180				vmu64[i].vmu_type = vmu32[i].vmu_type;
3181				vmu64[i].vmu_id = vmu32[i].vmu_id;
3182				vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3183				vmu64[i].vmu_rss_private =
3184				    vmu32[i].vmu_rss_private;
3185				vmu64[i].vmu_rss_shared =
3186				    vmu32[i].vmu_rss_shared;
3187				vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3188				vmu64[i].vmu_swap_private =
3189				    vmu32[i].vmu_swap_private;
3190				vmu64[i].vmu_swap_shared =
3191				    vmu32[i].vmu_swap_shared;
3192			}
3193		}
3194		return (ret);
3195	} else {
3196		/*
3197		 * kernel is 64 bit, so use 64 bit structures as zonestat
3198		 * expects.
3199		 */
3200		return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3201		    flags, age, (uintptr_t)buf, (uintptr_t)nres));
3202
3203	}
3204}
3205
3206/*
3207 * Update the current physical, virtual, and locked memory usage of the
3208 * running zones.
3209 */
3210static void
3211zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
3212{
3213
3214	uint64_t phys_total;
3215	uint64_t phys_used;
3216	uint64_t phys_zones;
3217	uint64_t phys_zones_overcount;
3218	uint64_t phys_zones_extra;
3219	uint64_t phys_zones_credit;
3220
3221	uint64_t vm_free;
3222	uint64_t vm_used;
3223
3224	uint64_t disk_swap_total;
3225	uint64_t disk_swap_used;	/* disk swap with contents */
3226
3227	uint64_t physmem = 0;
3228	uint64_t pp_kernel = 0;
3229	uint64_t arc_size = 0;
3230	struct anoninfo ani;
3231
3232	int num_swap_devices;
3233	struct swaptable *swt;
3234	struct swapent *swent;
3235	size_t swt_size;
3236	char *path;
3237
3238	zsd_vmusage64_t *vmusage;
3239	uint64_t num_vmusage;
3240
3241	int i, ret;
3242
3243	zsd_system_t *sys;
3244	zsd_zone_t *zone;
3245	int vmu_nzones;
3246
3247	kstat_t *kstat;
3248	char kstat_name[KSTAT_STRLEN];
3249	kstat_named_t *knp;
3250	kid_t kid;
3251
3252	if (init)
3253		return;
3254
3255	sys = ctl->zsctl_system;
3256
3257	/* interrogate swap devices to find the amount of disk swap */
3258disk_swap_again:
3259	num_swap_devices = swapctl(SC_GETNSWP, NULL);
3260
3261	if (num_swap_devices == 0) {
3262		sys->zss_swap_total = disk_swap_total = 0;
3263		sys->zss_swap_used = disk_swap_used = 0;
3264		/* No disk swap */
3265		goto disk_swap_done;
3266	}
3267	/* see if swap table needs to be larger */
3268	if (num_swap_devices > ctl->zsctl_swap_cache_num) {
3269		swt_size = sizeof (int) +
3270		    (num_swap_devices * sizeof (struct swapent)) +
3271		    (num_swap_devices * MAXPATHLEN);
3272		if (ctl->zsctl_swap_cache != NULL)
3273			free(ctl->zsctl_swap_cache);
3274
3275		swt = (struct swaptable *)malloc(swt_size);
3276		if (swt == NULL) {
3277			/*
3278			 * Could not allocate to get list of swap devices.
3279			 * Just use data from the most recent read, which will
3280			 * be zero if this is the first read.
3281			 */
3282			zsd_warn(gettext("Unable to allocate to determine "
3283			    "virtual memory"));
3284			disk_swap_total = sys->zss_swap_total;
3285			disk_swap_used = sys->zss_swap_used;
3286			goto disk_swap_done;
3287		}
3288		swent = swt->swt_ent;
3289		path = (char *)swt + (sizeof (int) +
3290		    num_swap_devices * sizeof (swapent_t));
3291		for (i = 0; i < num_swap_devices; i++, swent++) {
3292			swent->ste_path = path;
3293			path += MAXPATHLEN;
3294		}
3295		swt->swt_n = num_swap_devices;
3296		ctl->zsctl_swap_cache = swt;
3297		ctl->zsctl_swap_cache_size = swt_size;
3298		ctl->zsctl_swap_cache_num = num_swap_devices;
3299	}
3300	num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
3301	if (num_swap_devices < 0) {
3302		/* More swap devices have arrived */
3303		if (errno == ENOMEM)
3304			goto disk_swap_again;
3305
3306		zsd_warn(gettext("Unable to determine disk swap devices"));
3307		/* Unexpected error.  Use existing data */
3308		disk_swap_total = sys->zss_swap_total;
3309		disk_swap_used = sys->zss_swap_used;
3310		goto disk_swap_done;
3311	}
3312
3313	/* add up the disk swap */
3314	disk_swap_total = 0;
3315	disk_swap_used = 0;
3316	swent = ctl->zsctl_swap_cache->swt_ent;
3317	for (i = 0; i < num_swap_devices; i++, swent++) {
3318		disk_swap_total += swent->ste_pages;
3319		disk_swap_used += (swent->ste_pages - swent->ste_free);
3320	}
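	/* ste_pages and ste_free are in pages; convert to bytes */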
3321	disk_swap_total *= ctl->zsctl_pagesize;
3322	disk_swap_used *= ctl->zsctl_pagesize;
3323
3324	sys->zss_swap_total = disk_swap_total;
3325	sys->zss_swap_used = disk_swap_used;
3326
3327disk_swap_done:
3328
3329	/* get system pages kstat */
3330	kid = -1;
3331	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
3332	if (kstat == NULL)
3333		zsd_warn(gettext("Unable to lookup system pages kstat"));
3334	else
3335		kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3336
3337	if (kid == -1) {
3338		zsd_warn(gettext("Unable to read system pages kstat"));
3339		return;
3340	} else {
3341		knp = kstat_data_lookup(kstat, "physmem");
3342		if (knp == NULL) {
3343			zsd_warn(gettext("Unable to read physmem"));
3344		} else {
3345			if (knp->data_type == KSTAT_DATA_UINT64)
3346				physmem = knp->value.ui64;
3347			else if (knp->data_type == KSTAT_DATA_UINT32)
3348				physmem = knp->value.ui32;
3349			else
3350				return;
3351		}
3352		knp = kstat_data_lookup(kstat, "pp_kernel");
3353		if (knp == NULL) {
3354			zsd_warn(gettext("Unable to read pp_kernel"));
3355		} else {
3356			if (knp->data_type == KSTAT_DATA_UINT64)
3357				pp_kernel = knp->value.ui64;
3358			else if (knp->data_type == KSTAT_DATA_UINT32)
3359				pp_kernel = knp->value.ui32;
3360			else
3361				return;
3362		}
3363	}
3364	physmem *= ctl->zsctl_pagesize;
3365	pp_kernel *= ctl->zsctl_pagesize;
3366
3367	/* get the zfs arc size if available */
3368	arc_size = 0;
3369	kid = -1;
3370	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
3371	if (kstat != NULL)
3372		kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3373	if (kid != -1) {
3374		knp = kstat_data_lookup(kstat, "size");
3375		if (knp != NULL)
3376			if (knp->data_type == KSTAT_DATA_UINT64)
3377				arc_size = knp->value.ui64;
3378	}
3379
3380	/* Try to get swap information */
3381	if (swapctl(SC_AINFO, &ani) < 0) {
3382		zsd_warn(gettext("Unable to get swap info"));
3383		return;
3384	}
3385
3386vmusage_again:
3387	/* getvmusage to get physical memory usage */
3388	vmusage = ctl->zsctl_vmusage_cache;
3389	num_vmusage = ctl->zsctl_vmusage_cache_num;
3390
3391	ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
3392	    vmusage, &num_vmusage);
3393
3394	if (ret != 0) {
3395		/* Unexpected error.  Use existing data */
3396		if (errno != EOVERFLOW) {
3397			zsd_warn(gettext(
3398			    "Unable to read physical memory usage"));
3399			phys_zones = sys->zss_ram_zones;
3400			goto vmusage_done;
3401		}
3402	}
3403	/* vmusage results cache too small */
3404	if (num_vmusage > ctl->zsctl_vmusage_cache_num) {
3405
3406		size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;
3407
3408		if (ctl->zsctl_vmusage_cache != NULL)
3409			free(ctl->zsctl_vmusage_cache);
3410		vmusage = (zsd_vmusage64_t *)malloc(size);
3411		if (vmusage == NULL) {
3412			zsd_warn(gettext("Unable to alloc to determine "
3413			    "physical memory usage"));
3414			phys_zones = sys->zss_ram_zones;
3415			goto vmusage_done;
3416		}
3417		ctl->zsctl_vmusage_cache = vmusage;
3418		ctl->zsctl_vmusage_cache_num = num_vmusage;
3419		goto vmusage_again;
3420	}
3421
3422	phys_zones_overcount = 0;
3423	vmu_nzones = 0;
3424	for (i = 0; i < num_vmusage; i++) {
3425		switch (vmusage[i].vmu_type) {
3426		case VMUSAGE_SYSTEM:
3427			/* total pages backing user process mappings */
3428			phys_zones = sys->zss_ram_zones =
3429			    vmusage[i].vmu_rss_all;
3430			break;
3431		case VMUSAGE_ZONE:
3432			vmu_nzones++;
3433			phys_zones_overcount += vmusage[i].vmu_rss_all;
3434			zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
3435			if (zone != NULL)
3436				zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
3437			break;
3438		default:
3439			break;
3440		}
3441	}
3442	/*
3443	 * Figure out how much memory was double counted due to text sharing
3444	 * between zones.  Credit this back so that the sum of the zones
3445	 * equals the total zone ram usage.
3446	 */
3447	phys_zones_extra = phys_zones_overcount - phys_zones;
3448	phys_zones_credit = phys_zones_extra / vmu_nzones;
3449
3450vmusage_done:
3451
3452	/* walk the zones to get swap and locked kstats.  Fetch ram cap. */
3453	sys->zss_locked_zones = 0;
3454	sys->zss_vm_zones = 0;
3455	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3456	    zone = list_next(&ctl->zsctl_zones, zone)) {
3457
3458		/* If zone halted during interval, show memory usage as none */
3459		if (zone->zsz_active == B_FALSE ||
3460		    zone->zsz_deleted == B_TRUE) {
3461			zone->zsz_usage_ram = 0;
3462			zone->zsz_usage_vm = 0;
3463			zone->zsz_usage_locked = 0;
3464			continue;
3465		}
3466
3467		if (phys_zones_credit > 0) {
3468			if (zone->zsz_usage_ram > phys_zones_credit) {
3469				zone->zsz_usage_ram -= phys_zones_credit;
3470			}
3471		}
3472		/*
3473		 * Get zone's swap usage.  Since the zone could have halted,
3474		 * treat the usage as zero if it cannot be read.
3475		 */
3476		zone->zsz_usage_vm = 0;
3477		(void) snprintf(kstat_name, sizeof (kstat_name),
3478		    "swapresv_zone_%d", zone->zsz_id);
3479		kid = -1;
3480		kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3481		    zone->zsz_id, kstat_name);
3482		if (kstat != NULL)
3483			kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3484		if (kid != -1) {
3485			knp = kstat_data_lookup(kstat, "usage");
3486			if (knp != NULL &&
3487			    knp->data_type == KSTAT_DATA_UINT64) {
3488				zone->zsz_usage_vm = knp->value.ui64;
3489				sys->zss_vm_zones += knp->value.ui64;
3490			}
3491		}
3492		/*
3493		 * Get zone's locked usage.  Since the zone could have halted,
3494		 * treat the usage as zero if it cannot be read.
3495		 */
3496		zone->zsz_usage_locked = 0;
3497		(void) snprintf(kstat_name, sizeof (kstat_name),
3498		    "lockedmem_zone_%d", zone->zsz_id);
3499		kid = -1;
3500		kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3501		    zone->zsz_id, kstat_name);
3502		if (kstat != NULL)
3503			kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3504		if (kid != -1) {
3505			knp = kstat_data_lookup(kstat, "usage");
3506			if (knp != NULL &&
3507			    knp->data_type == KSTAT_DATA_UINT64) {
3508				zone->zsz_usage_locked = knp->value.ui64;
3509				/*
3510				 * Since locked memory accounting for zones
3511				 * can double count ddi locked memory, cap each
3512				 * zone's locked usage at its ram usage.
3513				 */
3514				if (zone->zsz_usage_locked >
3515				    zone->zsz_usage_ram)
3516					zone->zsz_usage_locked =
3517					    zone->zsz_usage_ram;
3518				sys->zss_locked_zones +=
3519				    zone->zsz_usage_locked;
3520			}
3521		}
3522	}
3523
3524	phys_total =
3525	    sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;
3526
3527	phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
3528	    * ctl->zsctl_pagesize;
3529
3530	/* Compute remaining statistics */
3531	sys->zss_ram_total = phys_total;
3532	sys->zss_ram_zones = phys_zones;
3533	sys->zss_ram_kern = phys_used - phys_zones - arc_size;
3534
3535	/*
3536	 * The total for kernel locked memory should include
3537	 * segkp locked pages, but oh well.  The arc size is subtracted,
3538	 * as that physical memory is reclaimable.
3539	 */
3540	sys->zss_locked_kern = pp_kernel - arc_size;
3541	/* Add memory used by kernel startup and obp to kernel locked */
3542	if ((phys_total - physmem) > 0)
3543		sys->zss_locked_kern += phys_total - physmem;
3544
3545	/*
3546	 * Add in the portion of (RAM+DISK) that is not available as swap,
3547	 * and consider it swap used by the kernel.
3548	 */
3549	sys->zss_vm_total = phys_total + disk_swap_total;
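	/*
	 * ani_max - ani_resv is the swap (RAM + disk) that is still
	 * available for reservation, in pages.
	 */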
3550	vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
3551	vm_used = sys->zss_vm_total - vm_free;
3552	sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
3553}
3554
3555/*
3556 * Charge each cpu's usage to its processor sets.  Also add the cpu's total
3557 * time to each zone using the processor set.  This tracks the maximum
3558 * amount of cpu time that a zone could have used.
3559 */
3560static void
3561zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
3562{
3563	zsd_system_t *sys;
3564	zsd_zone_t *zone;
3565	zsd_pset_usage_t *usage;
3566	zsd_cpu_t *cpu;
3567	zsd_cpu_t *cpu_next;
3568	zsd_pset_t *pset;
3569	timestruc_t ts;
3570	uint64_t hrtime;
3571	timestruc_t delta;
3572
3573	/* Update the per-cpu kstat data */
3574	cpu_next = list_head(&ctl->zsctl_cpus);
3575	while (cpu_next != NULL) {
3576		cpu = cpu_next;
3577		cpu_next = list_next(&ctl->zsctl_cpus, cpu);
3578		zsd_update_cpu_stats(ctl, cpu);
3579	}
3580	/* Update the elapsed real time */
3581	hrtime = gethrtime();
3582	if (init) {
3583		/* first time around, store hrtime for future comparison */
3584		ctl->zsctl_hrtime = hrtime;
3585		ctl->zsctl_hrtime_prev = hrtime;
3586
3587	} else {
3588		/* Compute increase in hrtime since the most recent read */
3589		ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
3590		ctl->zsctl_hrtime = hrtime;
3591		if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0)
3592			TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime);
3593	}
3594
3595	/* On initialization, all psets have zero time  */
3596	if (init)
3597		return;
3598
3599	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
3600	    pset = list_next(&ctl->zsctl_psets, pset)) {
3601
3602		if (pset->zsp_active == B_FALSE) {
3603			zsd_warn(gettext("Internal error, inactive pset found"));
3604			continue;
3605		}
3606
3607		/* sum total used time for pset */
3608		ts.tv_sec = 0;
3609		ts.tv_nsec = 0;
3610		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr);
3611		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern);
3612		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user);
3613		/* kernel time in pset is total time minus zone time */
3614		TIMESTRUC_DELTA(pset->zsp_usage_kern, ts,
3615		    pset->zsp_usage_zones);
3616		if (pset->zsp_usage_kern.tv_sec < 0 ||
3617		    pset->zsp_usage_kern.tv_nsec < 0) {
3618			pset->zsp_usage_kern.tv_sec = 0;
3619			pset->zsp_usage_kern.tv_nsec = 0;
3620		}
3621		/* Total pset elapsed time is used time plus idle time */
3622		TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle);
3623
3624		TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time);
3625
3626		for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
3627		    usage = list_next(&pset->zsp_usage_list, usage)) {
3628
3629			zone = usage->zsu_zone;
3630			if (usage->zsu_cpu_shares != ZS_LIMIT_NONE &&
3631			    usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
3632			    usage->zsu_cpu_shares != 0) {
3633				/*
3634				 * Figure out how many nanoseconds of share time
3635				 * to give to the zone
3636				 */
3637				hrtime = delta.tv_sec;
3638				hrtime *= NANOSEC;
3639				hrtime += delta.tv_nsec;
3640				hrtime *= usage->zsu_cpu_shares;
3641				hrtime /= pset->zsp_cpu_shares;
3642				TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time,
3643				    hrtime);
3644			}
3645			/* Add pset time to each zone using pset */
3646			TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta);
3647
3648			zone->zsz_cpus_online += pset->zsp_online;
3649		}
3650		pset->zsp_total_time = ts;
3651	}
3652
3653	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3654	    zone = list_next(&ctl->zsctl_zones, zone)) {
3655
3656		/* update cpu cap tracking if the zone has a cpu cap */
3657		if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) {
3658			uint64_t elapsed;
3659
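			/*
			 * The cpu cap is expressed in percent of a single
			 * cpu (100 = 1 full cpu), so the time allowed under
			 * the cap is elapsed * cap / 100.
			 */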
3660			elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
3661			elapsed *= zone->zsz_cpu_cap;
3662			elapsed = elapsed / 100;
3663			TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed);
3664		}
3665	}
3666	sys = ctl->zsctl_system;
3667	ts.tv_sec = 0;
3668	ts.tv_nsec = 0;
3669	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr);
3670	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern);
3671	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user);
3672
3673	/* kernel time for the system is total time minus zone time */
3674	TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts,
3675	    sys->zss_cpu_usage_zones);
3676	if (sys->zss_cpu_usage_kern.tv_sec < 0 ||
3677	    sys->zss_cpu_usage_kern.tv_nsec < 0) {
3678		sys->zss_cpu_usage_kern.tv_sec = 0;
3679		sys->zss_cpu_usage_kern.tv_nsec = 0;
3680	}
3681	/* Total system elapsed time is used time plus idle time */
3682	TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle);
3683	sys->zss_cpu_total_time = ts;
3684}
3685
3686/*
3687 * Saves current usage data to a cache that is read by libzonestat when
3688 * calling zs_usage_read().
3689 *
3690 * All pointers in the cached data structure are set to NULL.  When
3691 * libzonestat reads the cached data, it will set the pointers relative to
3692 * its address space.
3693 */
3694static void
3695zsd_usage_cache_update(zsd_ctl_t *ctl)
3696{
3697	zs_usage_cache_t *cache;
3698	zs_usage_cache_t *old;
3699	zs_usage_t *usage;
3700
3701	zs_system_t *sys;
3702	zsd_system_t *dsys;
3703	zs_zone_t *zone = NULL;
3704	zsd_zone_t *dzone;
3705	zs_pset_t *pset = NULL;
3706	zsd_pset_t *dpset;
3707	zs_pset_zone_t *pusage;
3708	zsd_pset_usage_t *dpusage;
3709
3710	char *next;
3711	uint_t size, i, j;
3712
3713	size =
3714	    sizeof (zs_usage_cache_t) +
3715	    sizeof (zs_usage_t) +
3716	    sizeof (zs_system_t) +
3717	    sizeof (zs_zone_t) * ctl->zsctl_nzones +
3718	    sizeof (zs_pset_t) *  ctl->zsctl_npsets +
3719	    sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;
3720
3721	cache = (zs_usage_cache_t *)malloc(size);
3722	if (cache == NULL) {
3723		zsd_warn(gettext("Unable to allocate usage cache\n"));
3724		return;
3725	}
3726
3727	next = (char *)cache;
3728	cache->zsuc_size = size - sizeof (zs_usage_cache_t);
3729	next += sizeof (zs_usage_cache_t);
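	/*
	 * The usage, system, zone, pset, and pset-zone structures are
	 * packed back to back in the blob; "next" tracks the current
	 * fill position.
	 */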
3730
3731	/* LINTED */
3732	usage = cache->zsuc_usage = (zs_usage_t *)next;
3733	next += sizeof (zs_usage_t);
3734	usage->zsu_start = g_start;
3735	usage->zsu_hrstart = g_hrstart;
3736	usage->zsu_time = g_now;
3737	usage->zsu_hrtime = g_hrnow;
3738	usage->zsu_nzones = ctl->zsctl_nzones;
3739	usage->zsu_npsets = ctl->zsctl_npsets;
3740	usage->zsu_system = NULL;
3741
3742	/* LINTED */
3743	sys = (zs_system_t *)next;
3744	next += sizeof (zs_system_t);
3745	dsys = ctl->zsctl_system;
3746	sys->zss_ram_total = dsys->zss_ram_total;
3747	sys->zss_ram_kern = dsys->zss_ram_kern;
3748	sys->zss_ram_zones = dsys->zss_ram_zones;
3749	sys->zss_locked_kern = dsys->zss_locked_kern;
3750	sys->zss_locked_zones = dsys->zss_locked_zones;
3751	sys->zss_vm_total = dsys->zss_vm_total;
3752	sys->zss_vm_kern = dsys->zss_vm_kern;
3753	sys->zss_vm_zones = dsys->zss_vm_zones;
3754	sys->zss_swap_total = dsys->zss_swap_total;
3755	sys->zss_swap_used = dsys->zss_swap_used;
3756	sys->zss_ncpus = dsys->zss_ncpus;
3757	sys->zss_ncpus_online = dsys->zss_ncpus_online;
3758
3759	sys->zss_processes_max = dsys->zss_maxpid;
3760	sys->zss_lwps_max = dsys->zss_lwps_max;
3761	sys->zss_shm_max = dsys->zss_shm_max;
3762	sys->zss_shmids_max = dsys->zss_shmids_max;
3763	sys->zss_semids_max = dsys->zss_semids_max;
3764	sys->zss_msgids_max = dsys->zss_msgids_max;
3765	sys->zss_lofi_max = dsys->zss_lofi_max;
3766
3767	sys->zss_processes = dsys->zss_processes;
3768	sys->zss_lwps = dsys->zss_lwps;
3769	sys->zss_shm = dsys->zss_shm;
3770	sys->zss_shmids = dsys->zss_shmids;
3771	sys->zss_semids = dsys->zss_semids;
3772	sys->zss_msgids = dsys->zss_msgids;
3773	sys->zss_lofi = dsys->zss_lofi;
3774
3775	sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
3776	sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
3777	sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;
3778
3779	for (i = 0, dzone = list_head(&ctl->zsctl_zones);
3780	    i < ctl->zsctl_nzones;
3781	    i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
3782		/* LINTED */
3783		zone = (zs_zone_t *)next;
3784		next += sizeof (zs_zone_t);
3785		list_link_init(&zone->zsz_next);
3786		zone->zsz_system = NULL;
3787
3788		(void) strlcpy(zone->zsz_name, dzone->zsz_name,
3789		    sizeof (zone->zsz_name));
3790		(void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
3791		    sizeof (zone->zsz_pool));
3792		(void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
3793		    sizeof (zone->zsz_pset));
3794		zone->zsz_id = dzone->zsz_id;
3795		zone->zsz_cputype = dzone->zsz_cputype;
3796		zone->zsz_iptype = dzone->zsz_iptype;
3797		zone->zsz_start = dzone->zsz_start;
3798		zone->zsz_hrstart = dzone->zsz_hrstart;
3799		zone->zsz_scheds = dzone->zsz_scheds;
3800		zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
3801		zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
3802		zone->zsz_ram_cap = dzone->zsz_ram_cap;
3803		zone->zsz_vm_cap = dzone->zsz_vm_cap;
3804		zone->zsz_locked_cap = dzone->zsz_locked_cap;
3805		zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
3806		zone->zsz_cpus_online = dzone->zsz_cpus_online;
3807		zone->zsz_pset_time = dzone->zsz_pset_time;
3808		zone->zsz_cap_time = dzone->zsz_cap_time;
3809		zone->zsz_share_time = dzone->zsz_share_time;
3810		zone->zsz_usage_ram = dzone->zsz_usage_ram;
3811		zone->zsz_usage_locked = dzone->zsz_usage_locked;
3812		zone->zsz_usage_vm = dzone->zsz_usage_vm;
3813
3814		zone->zsz_processes_cap = dzone->zsz_processes_cap;
3815		zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
3816		zone->zsz_shm_cap = dzone->zsz_shm_cap;
3817		zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
3818		zone->zsz_semids_cap = dzone->zsz_semids_cap;
3819		zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
3820		zone->zsz_lofi_cap = dzone->zsz_lofi_cap;
3821
3822		zone->zsz_processes = dzone->zsz_processes;
3823		zone->zsz_lwps = dzone->zsz_lwps;
3824		zone->zsz_shm = dzone->zsz_shm;
3825		zone->zsz_shmids = dzone->zsz_shmids;
3826		zone->zsz_semids = dzone->zsz_semids;
3827		zone->zsz_msgids = dzone->zsz_msgids;
3828		zone->zsz_lofi = dzone->zsz_lofi;
3829	}
3830
3831	for (i = 0, dpset = list_head(&ctl->zsctl_psets);
3832	    i < ctl->zsctl_npsets;
3833	    i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
3834		/* LINTED */
3835		pset = (zs_pset_t *)next;
3836		next += sizeof (zs_pset_t);
3837		list_link_init(&pset->zsp_next);
3838		(void) strlcpy(pset->zsp_name, dpset->zsp_name,
3839		    sizeof (pset->zsp_name));
3840		pset->zsp_id = dpset->zsp_id;
3841		pset->zsp_cputype = dpset->zsp_cputype;
3842		pset->zsp_start = dpset->zsp_start;
3843		pset->zsp_hrstart = dpset->zsp_hrstart;
3844		pset->zsp_online = dpset->zsp_online;
3845		pset->zsp_size = dpset->zsp_size;
3846		pset->zsp_min = dpset->zsp_min;
3847		pset->zsp_max = dpset->zsp_max;
3848		pset->zsp_importance = dpset->zsp_importance;
3849		pset->zsp_scheds = dpset->zsp_scheds;
3850		pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
3851		pset->zsp_total_time = dpset->zsp_total_time;
3852		pset->zsp_usage_kern = dpset->zsp_usage_kern;
3853		pset->zsp_usage_zones = dpset->zsp_usage_zones;
3854		pset->zsp_nusage = dpset->zsp_nusage;
3855		/* Add pset usages for pset */
3856		for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
3857		    j < dpset->zsp_nusage;
3858		    j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
3859			/* LINTED */
3860			pusage = (zs_pset_zone_t *)next;
3861			next += sizeof (zs_pset_zone_t);
3862			/* pointers are computed by client */
3863			pusage->zspz_pset = NULL;
3864			pusage->zspz_zone = NULL;
3865			list_link_init(&pusage->zspz_next);
3866			pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
3867			pusage->zspz_start = dpusage->zsu_start;
3868			pusage->zspz_hrstart = dpusage->zsu_hrstart;
3870			pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
3871			pusage->zspz_scheds = dpusage->zsu_scheds;
3872			pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
3873		}
3874	}
3875
3876	/* Update the current cache pointer */
3877	(void) mutex_lock(&g_usage_cache_lock);
3878	old = g_usage_cache;
3879	cache->zsuc_ref = 1;
3880	cache->zsuc_gen = g_gen_next;
3881	usage->zsu_gen = g_gen_next;
3882	usage->zsu_size = size;
3883	g_usage_cache = cache;
3884	if (old != NULL) {
3885		old->zsuc_ref--;
3886		if (old->zsuc_ref == 0)
3887			free(old);
3888	}
3889	g_gen_next++;
3890	/* Wake up any clients that are waiting for this calculation */
3891	if (g_usage_cache_kickers > 0) {
3892		(void) cond_broadcast(&g_usage_cache_wait);
3893	}
3894	(void) mutex_unlock(&g_usage_cache_lock);
3895}
3896
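/*
 * Take a reference on the current usage cache.  The caller must hold
 * g_usage_cache_lock and must drop the reference with
 * zsd_usage_cache_rele() when finished with the cache.
 */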
3897static zs_usage_cache_t *
3898zsd_usage_cache_hold_locked(void)
3899{
3900	zs_usage_cache_t *ret;
3901
3902	ret = g_usage_cache;
3903	ret->zsuc_ref++;
3904	return (ret);
3905}
3906
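/*
 * Release a reference on a usage cache.  The cache is freed when its last
 * reference is dropped.
 */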
3907void
3908zsd_usage_cache_rele(zs_usage_cache_t *cache)
3909{
3910	(void) mutex_lock(&g_usage_cache_lock);
3911	cache->zsuc_ref--;
3912	if (cache->zsuc_ref == 0)
3913		free(cache);
3914	(void) mutex_unlock(&g_usage_cache_lock);
3915}
3916
3917/* Close the handles held by zsd_open() */
3918void
3919zsd_close(zsd_ctl_t *ctl)
3920{
3921	zsd_zone_t *zone;
3922	zsd_pset_t *pset;
3923	zsd_pset_usage_t *usage;
3924	zsd_cpu_t *cpu;
3925	int id;
3926
3927	if (ctl->zsctl_kstat_ctl) {
3928		(void) kstat_close(ctl->zsctl_kstat_ctl);
3929		ctl->zsctl_kstat_ctl = NULL;
3930	}
3931	if (ctl->zsctl_proc_open) {
3932		(void) ea_close(&ctl->zsctl_proc_eaf);
3933		ctl->zsctl_proc_open = 0;
3934		ctl->zsctl_proc_fd = -1;
3935	}
3936	if (ctl->zsctl_pool_conf) {
3937		if (ctl->zsctl_pool_status == POOL_ENABLED)
3938			(void) pool_conf_close(ctl->zsctl_pool_conf);
3939		ctl->zsctl_pool_status = POOL_DISABLED;
3940	}
3941
3942	while ((zone = list_head(&ctl->zsctl_zones)) != NULL) {
3943		list_remove(&ctl->zsctl_zones, zone);
3944		free(zone);
3945		ctl->zsctl_nzones--;
3946	}
3947
3948	while ((pset = list_head(&ctl->zsctl_psets)) != NULL) {
3949		while ((usage = list_head(&pset->zsp_usage_list))
3950		    != NULL) {
3951			list_remove(&pset->zsp_usage_list, usage);
3952			ctl->zsctl_npset_usages--;
3953			free(usage);
3954		}
3955		list_remove(&ctl->zsctl_psets, pset);
3956		free(pset);
3957		ctl->zsctl_npsets--;
3958	}
3959
3960	/* Release all cpus being tracked */
3961	while ((cpu = list_head(&ctl->zsctl_cpus)) != NULL) {
3962		list_remove(&ctl->zsctl_cpus, cpu);
3963		id = cpu->zsc_id;
3964		bzero(cpu, sizeof (zsd_cpu_t));
3965		cpu->zsc_id = id;
3966		cpu->zsc_allocated = B_FALSE;
3967		cpu->zsc_psetid = ZS_PSET_ERROR;
3968		cpu->zsc_psetid_prev = ZS_PSET_ERROR;
3969	}
3970
3971	assert(ctl->zsctl_npset_usages == 0);
3972	assert(ctl->zsctl_npsets == 0);
3973	assert(ctl->zsctl_nzones == 0);
3974	(void) zsd_disable_cpu_stats();
3975}
3976
3977
3978/*
3979 * Update the utilization data for all zones and processor sets.
3980 */
3981static int
3982zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
3983{
3984	(void) kstat_chain_update(ctl->zsctl_kstat_ctl);
3985	(void) gettimeofday(&(ctl->zsctl_timeofday), NULL);
3986
3987	zsd_refresh_system(ctl);
3988
3989	/*
3990	 * Memory calculation is expensive.  Only update it on sample
3991	 * intervals.
3992	 */
3993	if (do_memory == B_TRUE)
3994		zsd_refresh_memory(ctl, init);
3995	zsd_refresh_zones(ctl);
3996	zsd_refresh_psets(ctl);
3997	zsd_refresh_procs(ctl, init);
3998	zsd_refresh_cpu_stats(ctl, init);
3999
4000	/*
4001	 * Delete objects that no longer exist.
4002	 * Pset usages must be deleted first as they point to zone and
4003	 * pset objects.
4004	 */
4005	zsd_mark_pset_usages_end(ctl);
4006	zsd_mark_psets_end(ctl);
4007	zsd_mark_cpus_end(ctl);
4008	zsd_mark_zones_end(ctl);
4009
4010	/*
4011	 * Save results for clients.
4012	 */
4013	zsd_usage_cache_update(ctl);
4014
4015	/*
4016	 * Roll process accounting file.
4017	 */
4018	(void) zsd_roll_exacct();
4019	return (0);
4020}
4021
4022/*
4023 * Get the system value of the named rctl, which is the uppermost limit.
4024 */
4025static uint64_t
4026zsd_get_system_rctl(char *name)
4027{
4028	rctlblk_t *rblk, *rblk_last;
4029
4030	rblk = (rctlblk_t *)alloca(rctlblk_size());
4031	rblk_last = (rctlblk_t *)alloca(rctlblk_size());
4032
4033	if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
4034		return (ZS_LIMIT_NONE);
4035
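	/* Walk the chain of rctl values; the last one is the system limit. */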
4036	while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
4037		(void) bcopy(rblk, rblk_last, rctlblk_size());
4038
4039	return (rctlblk_get_value(rblk_last));
4040}
4041
4042/*
4043 * Open any necessary subsystems for collecting utilization data,
4044 * allocate and initialize data structures, and get initial utilization.
4045 *
4046 * Errors:
4047 *	ENOMEM	out of memory
4048 *	EINVAL  other error
4049 */
4050static zsd_ctl_t *
4051zsd_open(zsd_ctl_t *ctl)
4052{
4053	zsd_system_t *system;
4054
4055	char path[MAXPATHLEN];
4056	long pathmax;
4057	struct statvfs svfs;
4058	int ret;
4059	int i;
4060	size_t size;
4061	int err;
4062
4063	if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
4064	    sizeof (zsd_ctl_t))) == NULL) {
4065		zsd_warn(gettext("Out of Memory"));
4066		errno = ENOMEM;
4067		goto err;
4068	}
4069	ctl->zsctl_proc_fd = -1;
4070
4071	/* open kstats */
4072	if (ctl->zsctl_kstat_ctl == NULL &&
4073	    (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
4074		err = errno;
4075		zsd_warn(gettext("Unable to open kstats"));
4076		errno = err;
4077		if (errno != ENOMEM)
4078			errno = EAGAIN;
4079		goto err;
4080	}
4081
4082	/*
4083	 * These are set when the accounting file is opened by
4084	 * zsd_update_procs()
4085	 */
4086	ctl->zsctl_proc_fd = -1;
4087	ctl->zsctl_proc_fd_next = -1;
4088	ctl->zsctl_proc_open = 0;
4089	ctl->zsctl_proc_open_next = 0;
4090
4092	(void) zsd_enable_cpu_stats();
4093
4094	/* Create structures to track usage */
4095	if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
4096	    calloc(1, sizeof (zsd_system_t))) == NULL) {
4097		ret = -1;
4098		zsd_warn(gettext("Out of Memory"));
4099		errno = ENOMEM;
4100		goto err;
4101	}
4102	system = ctl->zsctl_system;
4103	/* get the kernel bitness to know structure layout for getvmusage */
4104	ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
4105	if (ret < 0)
4106		ctl->zsctl_kern_bits = 32;
4107	else
4108		ctl->zsctl_kern_bits = 64;
4109	ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);
4110
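	/* Allocate the per-cpu tracking array, indexed by cpu id. */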
4111	size = sysconf(_SC_CPUID_MAX);
4112	ctl->zsctl_maxcpuid = size;
4113	if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
4114	    (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
4115		zsd_warn(gettext("Out of Memory"));
4116		errno = ENOMEM;
4117		goto err;
4118	}
4119	for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
4120		ctl->zsctl_cpu_array[i].zsc_id = i;
4121		ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
4122		ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
4123		ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
4124	}
4125	if (statvfs("/proc", &svfs) != 0 ||
4126	    strcmp("/proc", svfs.f_fstr) != 0) {
4127		zsd_warn(gettext("/proc not a procfs filesystem"));
4128		errno = EINVAL;
4129		goto err;
4130	}
4131
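	/* Allocate the per-process tracking array, indexed by pid. */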
4132	size = sysconf(_SC_MAXPID) + 1;
4133	ctl->zsctl_maxproc = size;
4134	if (ctl->zsctl_proc_array == NULL &&
4135	    (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
4136	    sizeof (zsd_proc_t))) == NULL) {
4137		zsd_warn(gettext("Out of Memory"));
4138		errno = ENOMEM;
4139		goto err;
4140	}
4141	for (i = 0; i < ctl->zsctl_maxproc; i++) {
4142		list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
4143		ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
4144		ctl->zsctl_proc_array[i].zspr_zoneid = -1;
4145		ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
4146		ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
4147		ctl->zsctl_proc_array[i].zspr_ppid = -1;
4148	}
4149
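	/* Lists of zones, psets and cpus tracked between reads. */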
4150	list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
4151	    offsetof(zsd_zone_t, zsz_next));
4152
4153	list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
4154	    offsetof(zsd_pset_t, zsp_next));
4155
4156	list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
4157	    offsetof(zsd_cpu_t, zsc_next));
4158
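	/* Size a dirent buffer for reading /proc directory entries. */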
4159	pathmax = pathconf("/proc", _PC_NAME_MAX);
4160	if (pathmax < 0) {
4161		zsd_warn(gettext("Unable to determine max path of /proc"));
4162		errno = EINVAL;
4163		goto err;
4164	}
4165	size = sizeof (struct dirent) + pathmax + 1;
4166
4167	ctl->zsctl_procfs_dent_size = size;
4168	if (ctl->zsctl_procfs_dent == NULL &&
4169	    (ctl->zsctl_procfs_dent = (struct dirent *)calloc(1, size))
4170	    == NULL) {
4171		zsd_warn(gettext("Out of Memory"));
4172		errno = ENOMEM;
4173		goto err;
4174	}
4175
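	/* Allocate pools facility handles used to read pset configuration. */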
4176	if (ctl->zsctl_pool_conf == NULL &&
4177	    (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
4178		zsd_warn(gettext("Out of Memory"));
4179		errno = ENOMEM;
4180		goto err;
4181	}
4182	ctl->zsctl_pool_status = POOL_DISABLED;
4183	ctl->zsctl_pool_changed = 0;
4184
4185	if (ctl->zsctl_pool_vals[0] == NULL &&
4186	    (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
4187		zsd_warn(gettext("Out of Memory"));
4188		errno = ENOMEM;
4189		goto err;
4190	}
4191	if (ctl->zsctl_pool_vals[1] == NULL &&
4192	    (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
4193		zsd_warn(gettext("Out of Memory"));
4194		errno = ENOMEM;
4195		goto err;
4196	}
4197	ctl->zsctl_pool_vals[2] = NULL;
4198
4199	/*
4200	 * get system limits
4201	 */
4202	system->zss_maxpid = size = sysconf(_SC_MAXPID);
4203	system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
4204	system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
4205	system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
4206	system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
4207	system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
4208	system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
4209	system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");
4210
4211	g_gen_next = 1;
4212
4213	if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
4214		zsd_warn(gettext("Reading zone statistics failed"));
4215
4216	return (ctl);
4217err:
4218	if (ctl)
4219		zsd_close(ctl);
4220
4221	return (NULL);
4222}
4223
4224/* Copy utilization data to buffer, filtering data if non-global zone. */
4225static void
4226zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
4227    boolean_t is_gz)
4228{
4229	zs_usage_t *cusage;
4230	zs_system_t *sys, *csys;
4231	zs_zone_t *zone, *czone;
4232	zs_pset_t *pset, *cpset;
4233	zs_pset_zone_t *pz, *cpz, *foundpz;
4234	size_t size = 0, csize = 0;
4235	char *start, *cstart;
4236	int i, j;
4237	timestruc_t delta;
4238
4239	/* Privileged users in the global zone get everything */
4240	if (is_gz) {
4241		cusage = cache->zsuc_usage;
4242		(void) bcopy(cusage, usage, cusage->zsu_size);
4243		return;
4244	}
4245
4246	/* Zones just get their own usage */
4247	cusage = cache->zsuc_usage;
4248
4249	start = (char *)usage;
4250	cstart = (char *)cusage;
4251	size += sizeof (zs_usage_t);
4252	csize += sizeof (zs_usage_t);
4253
4254	usage->zsu_start = cusage->zsu_start;
4255	usage->zsu_hrstart = cusage->zsu_hrstart;
4256	usage->zsu_time = cusage->zsu_time;
4257	usage->zsu_hrtime = cusage->zsu_hrtime;
4258	usage->zsu_gen = cusage->zsu_gen;
4259	usage->zsu_nzones = 1;
4260	usage->zsu_npsets = 0;
4261
4262	/* LINTED */
4263	sys = (zs_system_t *)(start + size);
4264	/* LINTED */
4265	csys = (zs_system_t *)(cstart + csize);
4266	size += sizeof (zs_system_t);
4267	csize += sizeof (zs_system_t);
4268
4269	/* Save system limits but not usage */
4270	*sys = *csys;
4271	sys->zss_ncpus = 0;
4272	sys->zss_ncpus_online = 0;
4273
4274	/* LINTED */
4275	zone = (zs_zone_t *)(start + size);
4276	/* LINTED */
4277	czone = (zs_zone_t *)(cstart + csize);
4278	/* Find the matching zone */
4279	for (i = 0; i < cusage->zsu_nzones; i++) {
4280		if (czone->zsz_id == zid) {
4281			*zone = *czone;
4282			size += sizeof (zs_zone_t);
4283		}
4284		csize += sizeof (zs_zone_t);
4285		/* LINTED */
4286		czone = (zs_zone_t *)(cstart + csize);
4287	}
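	/*
	 * Fold the memory and cpu usage of all other zones into the kernel
	 * figures, so the caller sees only its own zone's usage.
	 */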
4288	sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
4289	sys->zss_ram_zones = zone->zsz_usage_ram;
4290
4291	sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
4292	sys->zss_vm_zones = zone->zsz_usage_vm;
4293
4294	sys->zss_locked_kern += (sys->zss_locked_zones -
4295	    zone->zsz_usage_locked);
4296	sys->zss_locked_zones = zone->zsz_usage_locked;
4297
4298	TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
4299	TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
4300	sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;
4301
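	/*
	 * Copy only the psets in which this zone has usage, again folding
	 * the other zones' cpu time into the kernel usage of each pset.
	 */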
4302	/* LINTED */
4303	pset = (zs_pset_t *)(start + size);
4304	/* LINTED */
4305	cpset = (zs_pset_t *)(cstart + csize);
4306	for (i = 0; i < cusage->zsu_npsets; i++) {
4307		csize += sizeof (zs_pset_t);
4308		/* LINTED */
4309		cpz = (zs_pset_zone_t *)(csize + cstart);
4310		foundpz = NULL;
4311		for (j = 0; j < cpset->zsp_nusage; j++) {
4312			if (cpz->zspz_zoneid == zid)
4313				foundpz = cpz;
4314
4315			csize += sizeof (zs_pset_zone_t);
4316			/* LINTED */
4317			cpz = (zs_pset_zone_t *)(csize + cstart);
4318		}
4319		if (foundpz != NULL) {
4320			size += sizeof (zs_pset_t);
4321			/* LINTED */
4322			pz = (zs_pset_zone_t *)(start + size);
4323			size += sizeof (zs_pset_zone_t);
4324
4325			*pset = *cpset;
4326			*pz = *foundpz;
4327
4328			TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
4329			    pz->zspz_cpu_usage);
4330			TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
4331			pset->zsp_usage_zones = pz->zspz_cpu_usage;
4332			pset->zsp_nusage = 1;
4333			usage->zsu_npsets++;
4334			sys->zss_ncpus += pset->zsp_size;
4335			sys->zss_ncpus_online += pset->zsp_online;
4336		}
4337		/* LINTED */
4338		cpset = (zs_pset_t *)(cstart + csize);
4339	}
4340	usage->zsu_size = size;
4341}
4342
4343/*
4344 * Respond to new connections from libzonestat.so.  Also respond to zoneadmd,
4345 * which reports new zones.
4346 */
4347/* ARGSUSED */
4348static void
4349zsd_server(void *cookie, char *argp, size_t arg_size,
4350    door_desc_t *dp, uint_t n_desc)
4351{
4352	int *args, cmd;
4353	door_desc_t door;
4354	ucred_t *ucred;
4355	const priv_set_t *eset;
4356
4357	if (argp == DOOR_UNREF_DATA) {
4358		(void) door_return(NULL, 0, NULL, 0);
4359		thr_exit(NULL);
4360	}
4361
4362	if (arg_size != sizeof (cmd) * 2) {
4363		(void) door_return(NULL, 0, NULL, 0);
4364		thr_exit(NULL);
4365	}
4366
4367	/* LINTED */
4368	args = (int *)argp;
4369	cmd = args[0];
4370
4371	/* If connection, return door to stat server */
4372	if (cmd == ZSD_CMD_CONNECT) {
4373
4374		/* Verify client compilation version */
4375		if (args[1] != ZS_VERSION) {
4376			args[1] = ZSD_STATUS_VERSION_MISMATCH;
4377			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4378			thr_exit(NULL);
4379		}
4380		ucred = alloca(ucred_size());
4381		/* Verify client permission */
4382		if (door_ucred(&ucred) != 0) {
4383			args[1] = ZSD_STATUS_INTERNAL_ERROR;
4384			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4385			thr_exit(NULL);
4386		}
4387
4388		eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4389		if (eset == NULL) {
4390			args[1] = ZSD_STATUS_INTERNAL_ERROR;
4391			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4392			thr_exit(NULL);
4393		}
4394		if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4395			args[1] = ZSD_STATUS_PERMISSION;
4396			(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4397			thr_exit(NULL);
4398		}
4399
4400		/* Return stat server door */
4401		args[1] = ZSD_STATUS_OK;
4402		door.d_attributes = DOOR_DESCRIPTOR;
4403		door.d_data.d_desc.d_descriptor = g_stat_door;
4404		(void) door_return(argp, sizeof (cmd) * 2, &door, 1);
4405		thr_exit(NULL);
4406	}
4407
4408	/* Respond to zoneadmd informing zonestatd of a new zone */
4409	if (cmd == ZSD_CMD_NEW_ZONE) {
4410		zsd_fattach_zone(args[1], g_server_door, B_FALSE);
4411		(void) door_return(NULL, 0, NULL, 0);
4412		thr_exit(NULL);
4413	}
4414
4415	args[1] = ZSD_STATUS_INTERNAL_ERROR;
4416	(void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4417	thr_exit(NULL);
4418}
4419
4420/*
4421 * Respond to libzonestat.so clients with the current utilization data.
4422 */
4423/* ARGSUSED */
4424static void
4425zsd_stat_server(void *cookie, char *argp, size_t arg_size,
4426    door_desc_t *dp, uint_t n_desc)
4427{
4428	uint64_t *args, cmd;
4429	zs_usage_cache_t *cache;
4430	int ret;
4431	char *rvalp;
4432	size_t rvals;
4433	zs_usage_t *usage;
4434	ucred_t *ucred;
4435	zoneid_t zoneid;
4436	const priv_set_t *eset;
4437	boolean_t is_gz = B_FALSE;
4438
4439	/* Tell stat thread there are no more clients */
4440	if (argp == DOOR_UNREF_DATA) {
4441		(void) mutex_lock(&g_usage_cache_lock);
4442		g_hasclient = B_FALSE;
4443		(void) cond_signal(&g_usage_cache_kick);
4444		(void) mutex_unlock(&g_usage_cache_lock);
4445		(void) door_return(NULL, 0, NULL, 0);
4446		thr_exit(NULL);
4447	}
4448	if (arg_size != sizeof (cmd) * 2) {
4449		(void) door_return(NULL, 0, NULL, 0);
4450		thr_exit(NULL);
4451	}
4452	/* LINTED */
4453	args = (uint64_t *)argp;
4454	cmd = args[0];
4455	if (cmd != ZSD_CMD_READ) {
4456		(void) door_return(NULL, 0, NULL, 0);
4457		thr_exit(NULL);
4458	}
4459	ucred = alloca(ucred_size());
4460	if (door_ucred(&ucred) != 0) {
4461		(void) door_return(NULL, 0, NULL, 0);
4462		thr_exit(NULL);
4463	}
4464	zoneid = ucred_getzoneid(ucred);
4465
4466	if (zoneid == GLOBAL_ZONEID)
4467		is_gz = B_TRUE;
4468
4469	eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4470	if (eset == NULL) {
4471		(void) door_return(NULL, 0, NULL, 0);
4472		thr_exit(NULL);
4473	}
4474	if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4475		(void) door_return(NULL, 0, NULL, 0);
4476		thr_exit(NULL);
4477	}
4478	(void) mutex_lock(&g_usage_cache_lock);
4479	g_hasclient = B_TRUE;
4480
4481	/*
4482	 * Force a new cpu calculation for client.  This will force a
4483	 * new memory calculation if the memory data is older than the
4484	 * sample period.
4485	 */
4486	g_usage_cache_kickers++;
4487	(void) cond_signal(&g_usage_cache_kick);
4488	ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
4489	g_usage_cache_kickers--;
4490	if (ret == EINTR) {
4491		(void) mutex_unlock(&g_usage_cache_lock);
4492		zsd_warn(gettext(
4493		    "Interrupted before writing usage size to client\n"));
4494		(void) door_return(NULL, 0, NULL, 0);
4495		thr_exit(NULL);
4496	}
4497	cache = zsd_usage_cache_hold_locked();
4498	if (cache == NULL) {
4499		zsd_warn(gettext("Usage cache empty.\n"));
		(void) mutex_unlock(&g_usage_cache_lock);
4500		(void) door_return(NULL, 0, NULL, 0);
4501		thr_exit(NULL);
4502	}
4503	(void) mutex_unlock(&g_usage_cache_lock);
4504
4505	/* Copy current usage data to stack to send to client */
4506	usage = (zs_usage_t *)alloca(cache->zsuc_size);
4507
4508	/* Filter out results if caller is non-global zone */
4509	zsd_usage_filter(zoneid, cache, usage, is_gz);
4510
4511	rvalp = (void *)usage;
4512	rvals = usage->zsu_size;
4513	zsd_usage_cache_rele(cache);
4514
4515	(void) door_return(rvalp, rvals, NULL, 0);
4516	thr_exit(NULL);
4517}
4518
4519static volatile boolean_t g_quit;
4520
4521/* ARGSUSED */
4522static void
4523zonestat_quithandler(int sig)
4524{
4525	g_quit = B_TRUE;
4526}
4527
4528/*
4529 * The stat thread generates new utilization data when clients request
4530 * it.  It also manages opening and closing the subsystems used to gather
4531 * data depending on if clients exist.
4532 */
4533/* ARGSUSED */
4534void *
4535stat_thread(void *arg)
4536{
4537	time_t start;
4538	time_t now;
4539	time_t next_memory;
4540	boolean_t do_memory;
4541	boolean_t do_read;
4542	boolean_t do_close;
4543
4544	start = time(NULL);
4545	if (start < 0) {
4546		if (g_quit == B_TRUE)
4547			goto quit;
4548		zsd_warn(gettext("Unable to fetch current time"));
4549		g_quit = B_TRUE;
4550		goto quit;
4551	}
4552
4553	next_memory = start;
4554	while (g_quit == B_FALSE) {
4555		for (;;) {
4556			/*
4557			 * These decide whether the most recent memory
4558			 * calculation is still within a sample interval, and
4559			 * whether the usage collection needs to be opened or
4560			 * closed.
4561			 */
4562			do_memory = B_FALSE;
4563			do_read = B_FALSE;
4564			do_close = B_FALSE;
4565
4566			/*
4567			 * If all clients have gone, close usage collecting
4568			 */
4569			(void) mutex_lock(&g_usage_cache_lock);
4570			if (!g_hasclient && g_open == B_TRUE) {
4571				do_close = B_TRUE;
4572				(void) mutex_unlock(&g_usage_cache_lock);
4573				break;
4574			}
4575			if (g_quit == B_TRUE) {
4576				(void) mutex_unlock(
4577				    &g_usage_cache_lock);
4578				break;
4579			}
4580			/*
4581			 * Wait for a usage data request
4582			 */
4583			if (g_usage_cache_kickers == 0) {
4584				(void) cond_wait(&g_usage_cache_kick,
4585				    &g_usage_cache_lock);
4586			}
4587			now = time(NULL);
4588			if (now < 0) {
4589				if (g_quit == B_TRUE) {
4590					(void) mutex_unlock(
4591					    &g_usage_cache_lock);
4592					goto quit;
4593				}
4594				g_quit = B_TRUE;
4595				(void) mutex_unlock(&g_usage_cache_lock);
4596				zsd_warn(gettext(
4597				    "Unable to fetch current time"));
4598				goto quit;
4599			}
4600			if (g_hasclient) {
4601				do_read = B_TRUE;
4602				if (now >= next_memory) {
4603					do_memory = B_TRUE;
4604					next_memory = now + g_interval;
4605				}
4606			} else {
4607				do_close = B_TRUE;
4608			}
4609			(void) mutex_unlock(&g_usage_cache_lock);
4610			if (do_read || do_close)
4611				break;
4612		}
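		/* Record timestamps and take a new reading for the clients. */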
4613		g_now = now;
4614		g_hrnow = gethrtime();
4615		if (g_hasclient && g_open == B_FALSE) {
4616			g_start = g_now;
4617			g_hrstart = g_hrnow;
4618			g_ctl = zsd_open(g_ctl);
4619			if (g_ctl == NULL)
4620				zsd_warn(gettext(
4621				    "Unable to open zone statistics"));
4622			else
4623				g_open = B_TRUE;
4624		}
4625		if (do_read && g_ctl) {
4626			if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
4627				zsd_warn(gettext(
4628				    "Unable to read zone statistics"));
4629				g_quit = B_TRUE;
4630				return (NULL);
4631			}
4632		}
4633		(void) mutex_lock(&g_usage_cache_lock);
4634		if (!g_hasclient && g_open == B_TRUE && g_ctl) {
4635			(void) mutex_unlock(&g_usage_cache_lock);
4636			zsd_close(g_ctl);
4637			g_open = B_FALSE;
4638		} else {
4639			(void) mutex_unlock(&g_usage_cache_lock);
4640		}
4641	}
4642quit:
4643	if (g_open)
4644		zsd_close(g_ctl);
4645
4646	(void) thr_kill(g_main, SIGINT);
4647	thr_exit(NULL);
4648	return (NULL);
4649}
4650
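/*
 * Place the calling process in the FX scheduling class at fixed priority
 * 60.  main() uses this to run the daemon at high priority.
 */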
4651void
4652zsd_set_fx(void)
4653{
4654	pcinfo_t pcinfo;
4655	pcparms_t pcparms;
4656
4657	(void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
4658	if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
4659		zsd_warn(gettext("cannot get FX class parameters"));
4660		return;
4661	}
4662	pcparms.pc_cid = pcinfo.pc_cid;
4663	((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
4664	((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
4665	((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
4666	((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
4667	if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
4668		zsd_warn(gettext("cannot enter the FX class"));
4669}
4670
4671static int pipe_fd;
4672
4673static void
4674daemonize_ready(char status)
4675{
4676	/*
4677	 * wake the parent with a clue
4678	 */
4679	(void) write(pipe_fd, &status, 1);
4680	(void) close(pipe_fd);
4681}
4682
4683static int
4684daemonize_start(void)
4685{
4686	char data;
4687	int status;
4688
4689	int filedes[2];
4690	pid_t pid;
4691
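	/* Detach stdin and send stdout to the same place as stderr. */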
4692	(void) close(0);
4693	(void) dup2(2, 1);
4694
4695	if (pipe(filedes) < 0)
4696		return (-1);
4697
4698	(void) fflush(NULL);
4699
4700	if ((pid = fork1()) < 0)
4701		return (-1);
4702
4703	if (pid != 0) {
4704		/*
4705		 * parent
4706		 */
4707		struct sigaction act;
4708
4709		act.sa_handler = SIG_DFL;
4710		(void) sigemptyset(&act.sa_mask);
4711		act.sa_flags = 0;
4712
4713		(void) sigaction(SIGPIPE, &act, NULL);  /* default SIGPIPE */
4714
4715		(void) close(filedes[1]);
4716		if (read(filedes[0], &data, 1) == 1) {
4717			/* forward ready code via exit status */
4718			exit(data);
4719		}
4720		status = -1;
4721		(void) wait4(pid, &status, 0, NULL);
4722		/* daemon process exited before becoming ready */
4723		if (WIFEXITED(status)) {
4724			/* assume daemon process printed useful message */
4725			exit(WEXITSTATUS(status));
4726		} else {
4727			zsd_warn(gettext("daemon process killed or died"));
4728			exit(1);
4729		}
4730	}
4731
4732	/*
4733	 * child
4734	 */
4735	pipe_fd = filedes[1];
4736	(void) close(filedes[0]);
4737
4738	/*
4739	 * generic Unix setup
4740	 */
4741	(void) setsid();
4742	(void) umask(0000);
4743
4744	return (0);
4745}
4746
4747static void
4748fattach_all_zones(boolean_t detach_only)
4749{
4750	zoneid_t *zids;
4751	uint_t nzids, nzids_last;
4752	int i;
4753
4754again:
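	/*
	 * The set of zones can change between the two zone_list() calls.
	 * If more zones appeared after the list was sized, start over.
	 */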
4755	(void) zone_list(NULL, &nzids);
4756	nzids_last = nzids;
4757	zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last);
4758	if (zids == NULL)
4759		zsd_error(gettext("Out of memory"));
4760
4761	(void) zone_list(zids, &nzids);
4762	if (nzids > nzids_last) {
4763		free(zids);
4764		goto again;
4765	}
4766	for (i = 0; i < nzids; i++)
4767		zsd_fattach_zone(zids[i], g_server_door, detach_only);
4768
4769	free(zids);
4770}
4771
4772int
4773main(int argc, char *argv[])
4774{
4775
4776	int arg;
4777	thread_t tid;
4778	scf_simple_prop_t *prop;
4779	uint64_t *intervalp;
4780	boolean_t opt_cleanup = B_FALSE;
4781
4782	g_main = thr_self();
4783	g_quit = B_FALSE;
4784	(void) signal(SIGINT, zonestat_quithandler);
4785	(void) signal(SIGTERM, zonestat_quithandler);
4786	(void) signal(SIGHUP, zonestat_quithandler);
4787/*	(void) sigignore(SIGCHLD); */
4788	(void) sigignore(SIGPIPE);
4789
4790	if (getzoneid() != GLOBAL_ZONEID)
4791		zsd_error(gettext("Must be run from global zone only"));
4792
4793	while ((arg = getopt(argc, argv, "c")) != EOF) {
4795		switch (arg) {
4796		case 'c':
4797			opt_cleanup = B_TRUE;
4798			break;
4799		default:
4800			zsd_error(gettext("Invalid option"));
4801		}
4802	}
4803
4804	if (opt_cleanup) {
4805		if (zsd_disable_cpu_stats() != 0)
4806			exit(1);
4807		else
4808			exit(0);
4809	}
4810
4811	/* Get the configured sample interval */
4812	prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
4813	    "config", "sample_interval");
4814	if (prop == NULL)
4815		zsd_error(gettext("Unable to fetch SMF property "
4816		    "\"config/sample_interval\""));
4817
4818	if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
4819		zsd_error(gettext("Malformed SMF property "
4820		    "\"config/sample_interval\".  Must be of type \"count\""));
4821
4822	intervalp = scf_simple_prop_next_count(prop);
4823	g_interval = *intervalp;
4824	if (g_interval == 0)
4825		zsd_error(gettext("Malformed SMF property "
4826		    "\"config/sample_interval\".  Must be greater than zero"));
4827
4828	scf_simple_prop_free(prop);
4829
4830	if (daemonize_start() < 0)
4831		zsd_error(gettext("Unable to start daemon\n"));
4832
4833	/* Run at high priority */
4834	zsd_set_fx();
4835
4836	(void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
4837	(void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
4838	(void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);
4839
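	/* Door for client connections and zoneadmd new-zone notifications. */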
4840	g_server_door = door_create(zsd_server, NULL,
4841	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4842	if (g_server_door < 0)
4843		zsd_error(gettext("Unable to create server door\n"));
4844
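	/* Door that serves utilization data to connected clients. */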
4846	g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
4847	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4848	if (g_stat_door < 0)
4849		zsd_error(gettext("Unable to create statistics door\n"));
4850
4851	fattach_all_zones(B_FALSE);
4852
4853	if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
4854		zsd_error(gettext("Unable to create statistics thread\n"));
4855
4856	daemonize_ready(0);
4857
4858	/* Wait for signal to quit */
4859	while (g_quit == B_FALSE)
4860		(void) pause();
4861
4862	/* detach doors */
4863	fattach_all_zones(B_TRUE);
4864
4865	(void) door_revoke(g_server_door);
4866	(void) door_revoke(g_stat_door);
4867
4868	/* kick stat thread and wait for it to close the statistics */
4869	(void) mutex_lock(&g_usage_cache_lock);
4870	g_quit = B_TRUE;
4871	(void) cond_signal(&g_usage_cache_kick);
4872	(void) mutex_unlock(&g_usage_cache_lock);
4874	(void) thr_join(tid, NULL, NULL);
4875	return (0);
4876}
4877