1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 #include <alloca.h>
26 #include <assert.h>
27 #include <dirent.h>
28 #include <dlfcn.h>
29 #include <door.h>
30 #include <errno.h>
31 #include <exacct.h>
32 #include <ctype.h>
33 #include <fcntl.h>
34 #include <kstat.h>
35 #include <libcontract.h>
36 #include <libintl.h>
37 #include <libscf.h>
38 #include <zonestat.h>
39 #include <zonestat_impl.h>
40 #include <limits.h>
41 #include <pool.h>
42 #include <procfs.h>
43 #include <rctl.h>
44 #include <thread.h>
45 #include <signal.h>
46 #include <stdarg.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <strings.h>
51 #include <synch.h>
52 #include <sys/acctctl.h>
53 #include <sys/contract/process.h>
54 #include <sys/ctfs.h>
55 #include <sys/fork.h>
56 #include <sys/param.h>
57 #include <sys/priocntl.h>
58 #include <sys/fxpriocntl.h>
59 #include <sys/processor.h>
60 #include <sys/pset.h>
61 #include <sys/socket.h>
62 #include <sys/stat.h>
63 #include <sys/statvfs.h>
64 #include <sys/swap.h>
65 #include <sys/systeminfo.h>
66 #include <thread.h>
67 #include <sys/list.h>
68 #include <sys/time.h>
69 #include <sys/types.h>
70 #include <sys/vm_usage.h>
71 #include <sys/wait.h>
72 #include <sys/zone.h>
73 #include <time.h>
74 #include <ucred.h>
75 #include <unistd.h>
76 #include <vm/anon.h>
77 #include <zone.h>
78 #include <zonestat.h>
79 
#define	MAX_PSET_NAME	1024	/* Taken from PV_NAME_MAX_LEN */
/* NOTE(review): presumably the "no limit" value for a pset bound -- confirm */
#define	ZSD_PSET_UNLIMITED	UINT16_MAX
/*
 * Process accounting file that zonestatd configures through acctctl() when
 * no accounting file is already configured.
 */
#define	ZONESTAT_EXACCT_FILE	"/var/adm/exacct/zonestat-process"
83 
84 /*
85  * zonestatd implements gathering cpu and memory utilization data for
86  * running zones.  It has these components:
87  *
88  * zsd_server:
89  *	Door server to respond to client connections.  Each client
90  *	will connect using libzonestat.so, which will open and
91  *	call /var/tmp/.zonestat_door.  Each connecting client is given
92  *	a file descriptor to the stat server.
93  *
94  *	The zsd_server also responds to zoneadmd, which reports when a
95  *	new zone is booted.  This is used to fattach the zsd_server door
96  *	into the new zone.
97  *
98  * zsd_stat_server:
99  *	Receives client requests for the current utilization data.  Each
100  *	client request will cause zonestatd to update the current utilization
101  *	data by kicking the stat_thread.
102  *
103  *	If the client is in a non-global zone, the utilization data will
104  *	be filtered to only show the given zone.  The usage by all other zones
105  *	will be added to the system utilization.
106  *
107  * stat_thread:
108  *	The stat thread implements querying the system to determine the
109  *	current utilization data for each running zone.  This includes
110  *	inspecting the system's processor set configuration, as well as details
111  *	of each zone, such as their configured limits, and which processor
112  *	sets they are running in.
113  *
114  *	The stat_thread will only update memory utilization data as often as
115  *	the configured config/sample_interval on the zones-monitoring service.
116  */
117 
118 /*
119  * The private vmusage structure unfortunately uses size_t types, and assumes
120  * the caller's bitness matches the kernel's bitness.  Since the getvmusage()
121  * system call is contracted, and zonestatd is 32 bit, the following structures
122  * are used to interact with a 32bit or 64 bit kernel.
123  */
/*
 * vmusage record with 32-bit usage counters.
 * NOTE(review): presumably mirrors the record layout of a 32-bit kernel --
 * confirm against the kernel's vmusage structure.
 */
typedef struct zsd_vmusage32 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;

	/* rss and swap counters, 32 bits wide */
	uint32_t vmu_rss_all;
	uint32_t vmu_rss_private;
	uint32_t vmu_rss_shared;
	uint32_t vmu_swap_all;
	uint32_t vmu_swap_private;
	uint32_t vmu_swap_shared;
} zsd_vmusage32_t;
136 
/*
 * vmusage record with 64-bit usage counters.  This is also the element type
 * of the zsctl_vmusage_cache array in zsd_ctl_t.
 */
typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.
	 */
	int vmu_align_next_members_on_8_bytes;
	/* rss and swap counters, 64 bits wide */
	uint64_t vmu_rss_all;
	uint64_t vmu_rss_private;
	uint64_t vmu_rss_shared;
	uint64_t vmu_swap_all;
	uint64_t vmu_swap_private;
	uint64_t vmu_swap_shared;
} zsd_vmusage64_t;
153 
154 struct zsd_zone;
155 
/*
 * Used to store a zone's usage of a pset.  Entries are linked on the owning
 * pset's zsp_usage_list.
 */
typedef struct zsd_pset_usage {
	struct zsd_zone	*zsu_zone;	/* zone this usage is charged to */
	struct zsd_pset	*zsu_pset;	/* pset this usage is charged to */

	list_node_t	zsu_next;

	zoneid_t	zsu_zoneid;
	boolean_t	zsu_found;	/* zone bound at end of interval */
	boolean_t	zsu_active;	/* zone was bound during interval */
	boolean_t	zsu_new;	/* zone newly bound in this interval */
	boolean_t	zsu_deleted;	/* zone was unbound in this interval */
	boolean_t	zsu_empty;	/* no procs in pset in this interval */
	time_t		zsu_start;	/* time when zone was found in pset */
	hrtime_t	zsu_hrstart;	/* time when zone  was found in pset */
	uint64_t	zsu_cpu_shares;	/* copy of the zone's cpu shares */
	uint_t		zsu_scheds;	/* schedulers found in this pass */
	timestruc_t	zsu_cpu_usage;	/* cpu time used */
} zsd_pset_usage_t;
175 
/*
 * Used to store a pset's utilization.  Psets are linked on the zsctl_psets
 * list of zsd_ctl_t.
 */
typedef struct zsd_pset {
	psetid_t	zsp_id;
	list_node_t	zsp_next;
	char		zsp_name[ZS_PSETNAME_MAX];

	uint_t		zsp_cputype;	/* default, dedicated or shared */
	boolean_t	zsp_found;	/* pset found at end of interval */
	boolean_t	zsp_new;	/* pset new in this interval */
	boolean_t	zsp_deleted;	/* pset deleted in this interval */
	boolean_t	zsp_active;	/* pset existed during interval */
	boolean_t	zsp_empty;	/* no processes in pset */
	time_t		zsp_start;
	hrtime_t	zsp_hrstart;

	uint64_t	zsp_online;	/* online cpus in interval */
	uint64_t	zsp_size;	/* size in this interval */
	uint64_t	zsp_min;	/* configured min in this interval */
	uint64_t	zsp_max;	/* configured max in this interval */
	int64_t		zsp_importance;	/* configured importance in interval */

	uint_t		zsp_scheds;	/* scheds of processes found in pset */
	uint64_t	zsp_cpu_shares;	/* total shares in this interval */

	timestruc_t	zsp_total_time;
	timestruc_t	zsp_usage_kern;
	timestruc_t	zsp_usage_zones; /* sum of cpu time charged to zones */

	/* Individual zone usages of pset */
	list_t		zsp_usage_list;
	int		zsp_nusage;	/* number of entries on usage list */

	/* Summed kstat values from individual cpus in pset */
	timestruc_t	zsp_idle;
	timestruc_t	zsp_intr;
	timestruc_t	zsp_kern;
	timestruc_t	zsp_user;

} zsd_pset_t;
215 
/*
 * Used to track an individual cpu's utilization as reported by kstats.
 * For each counter the structure keeps the current sample, the previous
 * sample, and a running total.
 */
typedef struct zsd_cpu {
	processorid_t	zsc_id;
	list_node_t	zsc_next;
	psetid_t	zsc_psetid;
	psetid_t	zsc_psetid_prev;
	zsd_pset_t	*zsc_pset;

	boolean_t	zsc_found;	/* cpu online in this interval */
	boolean_t	zsc_onlined;	/* cpu onlined during this interval */
	boolean_t	zsc_offlined;	/* cpu offlined during this interval */
	boolean_t	zsc_active;	/* cpu online during this interval */
	boolean_t	zsc_allocated;	/* True if cpu has ever been found */

	/* kstats this interval */
	uint64_t	zsc_nsec_idle;
	uint64_t	zsc_nsec_intr;
	uint64_t	zsc_nsec_kern;
	uint64_t	zsc_nsec_user;

	/* kstats in most recent interval */
	uint64_t	zsc_nsec_idle_prev;
	uint64_t	zsc_nsec_intr_prev;
	uint64_t	zsc_nsec_kern_prev;
	uint64_t	zsc_nsec_user_prev;

	/* Total kstat increases since zonestatd started reading kstats */
	timestruc_t	zsc_idle;
	timestruc_t	zsc_intr;
	timestruc_t	zsc_kern;
	timestruc_t	zsc_user;

} zsd_cpu_t;
249 
/*
 * Used to describe an individual zone and its utilization.  Zones are kept
 * on the zsctl_zones list of zsd_ctl_t, sorted by zone name.
 */
typedef struct zsd_zone {
	zoneid_t	zsz_id;
	list_node_t	zsz_next;
	char		zsz_name[ZS_ZONENAME_MAX];
	uint_t		zsz_cputype;
	uint_t		zsz_iptype;
	time_t		zsz_start;	/* g_now when entry was allocated */
	hrtime_t	zsz_hrstart;	/* g_hrnow when entry was allocated */

	char		zsz_pool[ZS_POOLNAME_MAX];
	char		zsz_pset[ZS_PSETNAME_MAX];
	int		zsz_default_sched;
	/* These are deduced by inspecting processes */
	psetid_t	zsz_psetid;
	uint_t		zsz_scheds;

	boolean_t	zsz_new;	/* zone booted during this interval */
	boolean_t	zsz_deleted;	/* halted during this interval */
	boolean_t	zsz_active;	/* running in this interval */
	boolean_t	zsz_empty;	/* no processes in this interval */
	boolean_t	zsz_gone;	/* not installed in this interval */
	boolean_t	zsz_found;	/* Running at end of this interval */

	/* Configured limits; ZS_LIMIT_NONE when the entry is allocated */
	uint64_t	zsz_cpu_shares;
	uint64_t	zsz_cpu_cap;
	uint64_t	zsz_ram_cap;
	uint64_t	zsz_locked_cap;
	uint64_t	zsz_vm_cap;

	uint64_t	zsz_cpus_online;
	timestruc_t	zsz_cpu_usage;	/* cpu time charged to the zone */
	timestruc_t	zsz_cap_time;	/* cpu time of cpu cap */
	timestruc_t	zsz_share_time; /* cpu time of share of cpu */
	timestruc_t	zsz_pset_time;  /* time of all psets zone is bound to */

	uint64_t	zsz_usage_ram;
	uint64_t	zsz_usage_locked;
	uint64_t	zsz_usage_vm;

	/* Configured limits; ZS_LIMIT_NONE when the entry is allocated */
	uint64_t	zsz_processes_cap;
	uint64_t	zsz_lwps_cap;
	uint64_t	zsz_shm_cap;
	uint64_t	zsz_shmids_cap;
	uint64_t	zsz_semids_cap;
	uint64_t	zsz_msgids_cap;
	uint64_t	zsz_lofi_cap;

	/* NOTE(review): presumably current usage of the caps above -- confirm */
	uint64_t	zsz_processes;
	uint64_t	zsz_lwps;
	uint64_t	zsz_shm;
	uint64_t	zsz_shmids;
	uint64_t	zsz_semids;
	uint64_t	zsz_msgids;
	uint64_t	zsz_lofi;

} zsd_zone_t;
307 
308 /*
 * Used to track the cpu usage of an individual process.
 *
 * zonestatd sweeps /proc each interval and charges the cpu usage of processes
 * to their zone.  As processes exit, their extended accounting records are
313  * read and the difference of their total and known usage is charged to their
314  * zone.
315  *
316  * If a process is never seen in /proc, the total usage on its extended
317  * accounting record will be charged to its zone.
318  */
/* Per-process record; an array of these hangs off zsctl_proc_array. */
typedef struct zsd_proc {
	list_node_t	zspr_next;
	pid_t		zspr_ppid;
	psetid_t	zspr_psetid;
	zoneid_t	zspr_zoneid;
	int		zspr_sched;
	timestruc_t	zspr_usage;	/* NOTE(review): presumably the known */
					/* cpu usage so far -- confirm */
} zsd_proc_t;
327 
/*
 * Used to track the overall resource usage of the system.  A single instance
 * is referenced from zsd_ctl_t as zsctl_system.
 */
typedef struct zsd_system {

	uint64_t zss_ram_total;
	uint64_t zss_ram_kern;
	uint64_t zss_ram_zones;

	uint64_t zss_locked_kern;
	uint64_t zss_locked_zones;

	uint64_t zss_vm_total;
	uint64_t zss_vm_kern;
	uint64_t zss_vm_zones;

	uint64_t zss_swap_total;
	uint64_t zss_swap_used;

	timestruc_t zss_idle;
	timestruc_t zss_intr;
	timestruc_t zss_kern;
	timestruc_t zss_user;

	timestruc_t zss_cpu_total_time;
	timestruc_t zss_cpu_usage_kern;
	/* Sum of the cpu time charged to zones (see zsd_add_usage()) */
	timestruc_t zss_cpu_usage_zones;

	uint64_t zss_maxpid;
	uint64_t zss_processes_max;
	uint64_t zss_lwps_max;
	uint64_t zss_shm_max;
	uint64_t zss_shmids_max;
	uint64_t zss_semids_max;
	uint64_t zss_msgids_max;
	uint64_t zss_lofi_max;

	uint64_t zss_processes;
	uint64_t zss_lwps;
	uint64_t zss_shm;
	uint64_t zss_shmids;
	uint64_t zss_semids;
	uint64_t zss_msgids;
	uint64_t zss_lofi;

	uint64_t zss_ncpus;
	uint64_t zss_ncpus_online;

} zsd_system_t;
375 
376 /*
377  * A dumping ground for various information and structures used to compute
378  * utilization.
379  *
380  * This structure is used to track the system while clients are connected.
 * When the first client connects, a zsd_ctl is allocated and configured by
382  * zsd_open().  When all clients disconnect, the zsd_ctl is closed.
383  */
/* Top-level state used by zonestatd to compute utilization. */
typedef struct zsd_ctl {
	kstat_ctl_t	*zsctl_kstat_ctl;

	/* To track extended accounting */
	int		zsctl_proc_fd;		/* Log currently being used */
	ea_file_t	zsctl_proc_eaf;
	struct stat64	zsctl_proc_stat;
	int		zsctl_proc_open;
	int		zsctl_proc_fd_next;	/* Log file to use next */
	ea_file_t	zsctl_proc_eaf_next;
	struct stat64	zsctl_proc_stat_next;
	int		zsctl_proc_open_next;

	/* pool configuration handle */
	pool_conf_t	*zsctl_pool_conf;
	int		zsctl_pool_status;
	int		zsctl_pool_changed;

	/* The above usage tracking structures */
	zsd_system_t	*zsctl_system;
	list_t		zsctl_zones;	/* zsd_zone_t, sorted by zone name */
	list_t		zsctl_psets;	/* zsd_pset_t */
	list_t		zsctl_cpus;
	zsd_cpu_t	*zsctl_cpu_array;
	zsd_proc_t	*zsctl_proc_array;

	/* Various system info */
	uint64_t	zsctl_maxcpuid;
	uint64_t	zsctl_maxproc;
	uint64_t	zsctl_kern_bits;
	uint64_t	zsctl_pagesize;

	/* Used to track time available under a cpu cap. */
	uint64_t	zsctl_hrtime;
	uint64_t	zsctl_hrtime_prev;
	timestruc_t	zsctl_hrtime_total;

	struct timeval	zsctl_timeofday;

	/* Caches for arrays allocated for use by various system calls */
	psetid_t	*zsctl_pset_cache;
	uint_t		zsctl_pset_ncache;
	processorid_t	*zsctl_cpu_cache;
	uint_t		zsctl_cpu_ncache;
	zoneid_t	*zsctl_zone_cache;
	uint_t		zsctl_zone_ncache;
	struct swaptable *zsctl_swap_cache;
	uint64_t	zsctl_swap_cache_size;
	uint64_t	zsctl_swap_cache_num;
	zsd_vmusage64_t	*zsctl_vmusage_cache;
	uint64_t	zsctl_vmusage_cache_num;

	/* Info about procfs for scanning /proc */
	struct dirent	*zsctl_procfs_dent;
	long		zsctl_procfs_dent_size;
	pool_value_t	*zsctl_pool_vals[3];

	/* Counts on tracked entities */
	uint_t		zsctl_nzones;		/* entries allocated for zones */
	uint_t		zsctl_npsets;		/* entries on zsctl_psets */
	uint_t		zsctl_npset_usages;	/* usages across all psets */
} zsd_ctl_t;
446 
/* The daemon's single zsd_ctl_t instance and its client state */
zsd_ctl_t		*g_ctl;
boolean_t		g_open;		/* True if g_ctl is open */
int			g_hasclient;	/* True if any clients are connected */

/*
 * The usage cache is updated by the stat_thread, and copied to clients by
 * the zsd_stat_server.  Mutex and cond are to synchronize between the
 * stat_thread and the stat_server.
 */
zs_usage_cache_t	*g_usage_cache;
mutex_t			g_usage_cache_lock;
cond_t			g_usage_cache_kick;
uint_t			g_usage_cache_kickers;
cond_t			g_usage_cache_wait;
char			*g_usage_cache_buf;
uint_t			g_usage_cache_bufsz;
uint64_t		g_gen_next;

/* fds of door servers */
int			g_server_door;
int			g_stat_door;

/*
 * Starting and current time.  Used to throttle memory calculation, and to
 * mark new zones and psets with their boot and creation time.
 */
time_t			g_now;
time_t			g_start;
hrtime_t		g_hrnow;
hrtime_t		g_hrstart;
uint64_t		g_interval;

/*
 * main() thread.
 */
thread_t		g_main;
483 
/*
 * Print a warning on stderr.  The message is prefixed with the localized
 * "zonestat: Warning: " tag and followed by a newline.
 *
 * fmt is a printf-style format string and the remaining arguments are the
 * values it consumes.  Callers in this file usually pass fmt through
 * gettext() themselves.
 */
/* PRINTFLIKE1 */
static void
zsd_warn(const char *fmt, ...)
{
	va_list alist;

	/*
	 * Emit the prefix with fputs() so that a translated string is never
	 * interpreted as a format string.
	 */
	(void) fputs(gettext("zonestat: Warning: "), stderr);

	va_start(alist, fmt);
	(void) vfprintf(stderr, fmt, alist);
	va_end(alist);

	(void) fputc('\n', stderr);
}
497 
/*
 * Print an error on stderr and terminate the process with exit status 1.
 * The message is prefixed with the localized "zonestat: Error: " tag and
 * followed by a newline.  This function does not return.
 *
 * fmt is a printf-style format string and the remaining arguments are the
 * values it consumes.
 */
/* PRINTFLIKE1 */
static void
zsd_error(const char *fmt, ...)
{
	va_list alist;

	/*
	 * Emit the prefix with fputs() so that a translated string is never
	 * interpreted as a format string.
	 */
	(void) fputs(gettext("zonestat: Error: "), stderr);

	va_start(alist, fmt);
	(void) vfprintf(stderr, fmt, alist);
	va_end(alist);

	(void) fputc('\n', stderr);
	exit(1);
}
512 
513 /* Turns on extended accounting if not configured externally */
514 int
515 zsd_enable_cpu_stats()
516 {
517 	char *path = ZONESTAT_EXACCT_FILE;
518 	char oldfile[MAXPATHLEN];
519 	int ret, state = AC_ON;
520 	ac_res_t res[6];
521 
522 	/*
523 	 * Start a new accounting file  if accounting not configured
524 	 * externally.
525 	 */
526 
527 	res[0].ar_id = AC_PROC_PID;
528 	res[0].ar_state = AC_ON;
529 	res[1].ar_id = AC_PROC_ANCPID;
530 	res[1].ar_state = AC_ON;
531 	res[2].ar_id = AC_PROC_CPU;
532 	res[2].ar_state = AC_ON;
533 	res[3].ar_id = AC_PROC_TIME;
534 	res[3].ar_state = AC_ON;
535 	res[4].ar_id = AC_PROC_ZONENAME;
536 	res[4].ar_state = AC_ON;
537 	res[5].ar_id = AC_NONE;
538 	res[5].ar_state = AC_ON;
539 	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
540 		zsd_warn(gettext("Unable to set accounting resources"));
541 		return (-1);
542 	}
543 	/* Only set accounting file if none is configured */
544 	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
545 	if (ret < 0) {
546 
547 		(void) unlink(path);
548 		if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
549 		    == -1) {
550 			zsd_warn(gettext("Unable to set accounting file"));
551 			return (-1);
552 		}
553 	}
554 	if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
555 		zsd_warn(gettext("Unable to enable accounting"));
556 		return (-1);
557 	}
558 	return (0);
559 }
560 
561 /* Turns off extended accounting if not configured externally */
562 int
563 zsd_disable_cpu_stats()
564 {
565 	char *path = ZONESTAT_EXACCT_FILE;
566 	int ret, state = AC_OFF;
567 	ac_res_t res[6];
568 	char oldfile[MAXPATHLEN];
569 
570 	/* If accounting file is externally configured, leave it alone */
571 	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
572 	if (ret == 0 && strcmp(oldfile, path) != 0)
573 		return (0);
574 
575 	res[0].ar_id = AC_PROC_PID;
576 	res[0].ar_state = AC_OFF;
577 	res[1].ar_id = AC_PROC_ANCPID;
578 	res[1].ar_state = AC_OFF;
579 	res[2].ar_id = AC_PROC_CPU;
580 	res[2].ar_state = AC_OFF;
581 	res[3].ar_id = AC_PROC_TIME;
582 	res[3].ar_state = AC_OFF;
583 	res[4].ar_id = AC_PROC_ZONENAME;
584 	res[4].ar_state = AC_OFF;
585 	res[5].ar_id = AC_NONE;
586 	res[5].ar_state = AC_OFF;
587 	if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
588 		zsd_warn(gettext("Unable to clear accounting resources"));
589 		return (-1);
590 	}
591 	if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
592 		zsd_warn(gettext("Unable to clear accounting file"));
593 		return (-1);
594 	}
595 	if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
596 		zsd_warn(gettext("Unable to diable accounting"));
597 		return (-1);
598 	}
599 
600 	(void) unlink(path);
601 	return (0);
602 }
603 
604 /*
605  * If not configured externally, deletes the current extended accounting file
606  * and starts a new one.
607  *
608  * Since the stat_thread holds an open handle to the accounting file, it will
609  * read all remaining entries from the old file before switching to
610  * read the new one.
611  */
612 int
613 zsd_roll_exacct(void)
614 {
615 	int ret;
616 	char *path = ZONESTAT_EXACCT_FILE;
617 	char oldfile[MAXPATHLEN];
618 
619 	/* If accounting file is externally configured, leave it alone */
620 	ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
621 	if (ret == 0 && strcmp(oldfile, path) != 0)
622 		return (0);
623 
624 	if (unlink(path) != 0)
625 		/* Roll it next time */
626 		return (0);
627 
628 	if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
629 		zsd_warn(gettext("Unable to set accounting file"));
630 		return (-1);
631 	}
632 	return (0);
633 }
634 
635 /* Contract stuff for zone_enter() */
636 int
637 init_template(void)
638 {
639 	int fd;
640 	int err = 0;
641 
642 	fd = open64(CTFS_ROOT "/process/template", O_RDWR);
643 	if (fd == -1)
644 		return (-1);
645 
646 	/*
647 	 * For now, zoneadmd doesn't do anything with the contract.
648 	 * Deliver no events, don't inherit, and allow it to be orphaned.
649 	 */
650 	err |= ct_tmpl_set_critical(fd, 0);
651 	err |= ct_tmpl_set_informative(fd, 0);
652 	err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
653 	err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
654 	if (err || ct_tmpl_activate(fd)) {
655 		(void) close(fd);
656 		return (-1);
657 	}
658 
659 	return (fd);
660 }
661 
662 /*
663  * Contract stuff for zone_enter()
664  */
665 int
666 contract_latest(ctid_t *id)
667 {
668 	int cfd, r;
669 	ct_stathdl_t st;
670 	ctid_t result;
671 
672 	if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
673 		return (errno);
674 
675 	if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
676 		(void) close(cfd);
677 		return (r);
678 	}
679 
680 	result = ct_status_get_id(st);
681 	ct_status_free(st);
682 	(void) close(cfd);
683 
684 	*id = result;
685 	return (0);
686 }
687 
/*
 * Set FD_CLOEXEC on fd, keeping the other descriptor flags.
 * Returns 0 on success, or -1 if either fcntl() call fails.
 */
static int
close_on_exec(int fd)
{
	int fdflags;

	fdflags = fcntl(fd, F_GETFD, 0);
	if (fdflags == -1) {
		return (-1);
	}
	if (fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC) == -1) {
		return (-1);
	}
	return (0);
}
696 
697 int
698 contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
699 {
700 	char path[PATH_MAX];
701 	int n, fd;
702 
703 	if (type == NULL)
704 		type = "all";
705 
706 	n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file);
707 	if (n >= sizeof (path)) {
708 		errno = ENAMETOOLONG;
709 		return (-1);
710 	}
711 
712 	fd = open64(path, oflag);
713 	if (fd != -1) {
714 		if (close_on_exec(fd) == -1) {
715 			int err = errno;
716 			(void) close(fd);
717 			errno = err;
718 			return (-1);
719 		}
720 	}
721 	return (fd);
722 }
723 
724 int
725 contract_abandon_id(ctid_t ctid)
726 {
727 	int fd, err;
728 
729 	fd = contract_open(ctid, "all", "ctl", O_WRONLY);
730 	if (fd == -1)
731 		return (errno);
732 
733 	err = ct_ctl_abandon(fd);
734 	(void) close(fd);
735 
736 	return (err);
737 }
738 /*
739  * Attach the zsd_server to a zone.  Called for each zone when zonestatd
740  * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
741  *
742  * Zone_enter is used to avoid reaching into zone to fattach door.
743  */
744 static void
745 zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
746 {
747 	char *path = ZS_DOOR_PATH;
748 	int fd, pid, stat, tmpl_fd;
749 	ctid_t ct;
750 
751 	if ((tmpl_fd = init_template()) == -1) {
752 		zsd_warn("Unable to init template");
753 		return;
754 	}
755 
756 	pid = forkx(0);
757 	if (pid < 0) {
758 		(void) ct_tmpl_clear(tmpl_fd);
759 		zsd_warn(gettext(
760 		    "Unable to fork to add zonestat to zoneid %d\n"), zid);
761 		return;
762 	}
763 
764 	if (pid == 0) {
765 		(void) ct_tmpl_clear(tmpl_fd);
766 		(void) close(tmpl_fd);
767 		if (zid != 0 && zone_enter(zid) != 0) {
768 			if (errno == EINVAL) {
769 				_exit(0);
770 			}
771 			_exit(1);
772 		}
773 		(void) fdetach(path);
774 		(void) unlink(path);
775 		if (detach_only)
776 			_exit(0);
777 		fd = open(path, O_CREAT|O_RDWR, 0644);
778 		if (fd < 0)
779 			_exit(2);
780 		if (fattach(door, path) != 0)
781 			_exit(3);
782 		_exit(0);
783 	}
784 	if (contract_latest(&ct) == -1)
785 		ct = -1;
786 	(void) ct_tmpl_clear(tmpl_fd);
787 	(void) close(tmpl_fd);
788 	(void) contract_abandon_id(ct);
789 	while (waitpid(pid, &stat, 0) != pid)
790 		;
791 	if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
792 		return;
793 
794 	zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
795 
796 	if (WEXITSTATUS(stat) == 1)
797 		zsd_warn(gettext("Cannot entering zone"));
798 	else if (WEXITSTATUS(stat) == 2)
799 		zsd_warn(gettext("Unable to create door file: %s"), path);
800 	else if (WEXITSTATUS(stat) == 3)
801 		zsd_warn(gettext("Unable to fattach file: %s"), path);
802 
803 	zsd_warn(gettext("Internal error entering zone: %d"), zid);
804 }
805 
806 /*
807  * Zone lookup and allocation functions to manage list of currently running
808  * zones.
809  */
810 static zsd_zone_t *
811 zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
812 {
813 	zsd_zone_t *zone;
814 
815 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
816 	    zone = list_next(&ctl->zsctl_zones, zone)) {
817 		if (strcmp(zone->zsz_name, zonename) == 0) {
818 			if (zoneid != -1)
819 				zone->zsz_id = zoneid;
820 			return (zone);
821 		}
822 	}
823 	return (NULL);
824 }
825 
826 static zsd_zone_t *
827 zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
828 {
829 	zsd_zone_t *zone;
830 
831 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
832 	    zone = list_next(&ctl->zsctl_zones, zone)) {
833 		if (zone->zsz_id == zoneid)
834 			return (zone);
835 	}
836 	return (NULL);
837 }
838 
839 static zsd_zone_t *
840 zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
841 {
842 	zsd_zone_t *zone;
843 
844 	if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
845 		return (NULL);
846 
847 	(void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
848 	zone->zsz_id = zoneid;
849 	zone->zsz_found = B_FALSE;
850 
851 	/*
852 	 * Allocate as deleted so if not found in first pass, zone is deleted
853 	 * from list.  This can happen if zone is returned by zone_list, but
854 	 * exits before first attempt to fetch zone details.
855 	 */
856 	zone->zsz_start = g_now;
857 	zone->zsz_hrstart = g_hrnow;
858 	zone->zsz_deleted = B_TRUE;
859 
860 	zone->zsz_cpu_shares = ZS_LIMIT_NONE;
861 	zone->zsz_cpu_cap = ZS_LIMIT_NONE;
862 	zone->zsz_ram_cap = ZS_LIMIT_NONE;
863 	zone->zsz_locked_cap = ZS_LIMIT_NONE;
864 	zone->zsz_vm_cap = ZS_LIMIT_NONE;
865 
866 	zone->zsz_processes_cap = ZS_LIMIT_NONE;
867 	zone->zsz_lwps_cap = ZS_LIMIT_NONE;
868 	zone->zsz_shm_cap = ZS_LIMIT_NONE;
869 	zone->zsz_shmids_cap = ZS_LIMIT_NONE;
870 	zone->zsz_semids_cap = ZS_LIMIT_NONE;
871 	zone->zsz_msgids_cap = ZS_LIMIT_NONE;
872 	zone->zsz_lofi_cap = ZS_LIMIT_NONE;
873 
874 	ctl->zsctl_nzones++;
875 
876 	return (zone);
877 }
878 
879 static zsd_zone_t *
880 zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
881 {
882 	zsd_zone_t *zone, *tmp;
883 
884 	if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
885 		return (zone);
886 
887 	if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
888 		return (NULL);
889 
890 	/* Insert sorted by zonename */
891 	tmp = list_head(&ctl->zsctl_zones);
892 	while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
893 		tmp = list_next(&ctl->zsctl_zones, tmp);
894 
895 	list_insert_before(&ctl->zsctl_zones, tmp, zone);
896 	return (zone);
897 }
898 
899 /*
900  * Mark all zones as not existing.  As zones are found, they will
901  * be marked as existing.  If a zone is not found, then it must have
902  * halted.
903  */
904 static void
905 zsd_mark_zones_start(zsd_ctl_t *ctl)
906 {
907 
908 	zsd_zone_t *zone;
909 
910 	for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
911 	    zone = list_next(&ctl->zsctl_zones, zone)) {
912 		zone->zsz_found = B_FALSE;
913 	}
914 }
915 
916 /*
917  * Mark each zone as not using pset.  If processes are found using the
918  * pset, the zone will remain bound to the pset.  If none of a zones
919  * processes are bound to the pset, the zone's usage of the pset will
920  * be deleted.
921  *
922  */
923 static void
924 zsd_mark_pset_usage_start(zsd_pset_t *pset)
925 {
926 	zsd_pset_usage_t *usage;
927 
928 	for (usage = list_head(&pset->zsp_usage_list);
929 	    usage != NULL;
930 	    usage = list_next(&pset->zsp_usage_list, usage)) {
931 		usage->zsu_found = B_FALSE;
932 		usage->zsu_empty = B_TRUE;
933 	}
934 }
935 
936 /*
937  * Mark each pset as not existing.  If a pset is found, it will be marked
938  * as existing.  If a pset is not found, it wil be deleted.
939  */
940 static void
941 zsd_mark_psets_start(zsd_ctl_t *ctl)
942 {
943 	zsd_pset_t *pset;
944 
945 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
946 	    pset = list_next(&ctl->zsctl_psets, pset)) {
947 		pset->zsp_found = B_FALSE;
948 		zsd_mark_pset_usage_start(pset);
949 	}
950 }
951 
952 /*
953  * A pset was found.  Update its information
954  */
955 static void
956 zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
957     uint64_t size, uint64_t min, uint64_t max, int64_t importance)
958 {
959 	pset->zsp_empty = B_TRUE;
960 	pset->zsp_deleted = B_FALSE;
961 
962 	assert(pset->zsp_found == B_FALSE);
963 
964 	/* update pset flags */
965 	if (pset->zsp_active == B_FALSE)
966 		/* pset not seen on previous interval.  It is new. */
967 		pset->zsp_new = B_TRUE;
968 	else
969 		pset->zsp_new = B_FALSE;
970 
971 	pset->zsp_found = B_TRUE;
972 	pset->zsp_cputype = type;
973 	pset->zsp_online = online;
974 	pset->zsp_size = size;
975 	pset->zsp_min = min;
976 	pset->zsp_max = max;
977 	pset->zsp_importance = importance;
978 	pset->zsp_cpu_shares = 0;
979 	pset->zsp_scheds = 0;
980 	pset->zsp_active = B_TRUE;
981 }
982 
983 /*
984  * A zone's process was found using a pset. Charge the process to the pset and
985  * the per-zone data for the pset.
986  */
987 static void
988 zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
989 {
990 	zsd_zone_t *zone = usage->zsu_zone;
991 	zsd_pset_t *pset = usage->zsu_pset;
992 
993 	/* Nothing to do if already found */
994 	if (usage->zsu_found == B_TRUE)
995 		goto add_stats;
996 
997 	usage->zsu_found = B_TRUE;
998 	usage->zsu_empty = B_FALSE;
999 
1000 	usage->zsu_deleted = B_FALSE;
1001 	/* update usage flags */
1002 	if (usage->zsu_active == B_FALSE)
1003 		usage->zsu_new = B_TRUE;
1004 	else
1005 		usage->zsu_new = B_FALSE;
1006 
1007 	usage->zsu_scheds = 0;
1008 	usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1009 	usage->zsu_active = B_TRUE;
1010 	pset->zsp_empty = B_FALSE;
1011 	zone->zsz_empty = B_FALSE;
1012 
1013 add_stats:
1014 	/* Detect zone's pset id, and if it is bound to multiple psets */
1015 	if (zone->zsz_psetid == ZS_PSET_ERROR)
1016 		zone->zsz_psetid = pset->zsp_id;
1017 	else if (zone->zsz_psetid != pset->zsp_id)
1018 		zone->zsz_psetid = ZS_PSET_MULTI;
1019 
1020 	usage->zsu_scheds |= sched;
1021 	pset->zsp_scheds |= sched;
1022 	zone->zsz_scheds |= sched;
1023 
1024 	/* Record if FSS is co-habitating with conflicting scheduler */
1025 	if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
1026 	    usage->zsu_scheds & (
1027 	    ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
1028 		usage->zsu_scheds |= ZS_SCHED_CONFLICT;
1029 
1030 		pset->zsp_scheds |= ZS_SCHED_CONFLICT;
1031 	}
1032 
1033 }
1034 
1035 /* Add cpu time for a process to a pset, zone, and system totals */
1036 static void
1037 zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1038 {
1039 	zsd_system_t *system = ctl->zsctl_system;
1040 	zsd_zone_t *zone = usage->zsu_zone;
1041 	zsd_pset_t *pset = usage->zsu_pset;
1042 
1043 	TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1044 	TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1045 	TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1046 	TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1047 }
1048 
1049 /* Determine which processor sets have been deleted */
1050 static void
1051 zsd_mark_psets_end(zsd_ctl_t *ctl)
1052 {
1053 	zsd_pset_t *pset, *tmp;
1054 
1055 	/*
1056 	 * Mark pset as not exists, and deleted if it existed
1057 	 * previous interval.
1058 	 */
1059 	pset = list_head(&ctl->zsctl_psets);
1060 	while (pset != NULL) {
1061 		if (pset->zsp_found == B_FALSE) {
1062 			pset->zsp_empty = B_TRUE;
1063 			if (pset->zsp_deleted == B_TRUE) {
1064 				tmp = pset;
1065 				pset = list_next(&ctl->zsctl_psets, pset);
1066 				list_remove(&ctl->zsctl_psets, tmp);
1067 				free(tmp);
1068 				ctl->zsctl_npsets--;
1069 				continue;
1070 			} else {
1071 				/* Pset vanished during this interval */
1072 				pset->zsp_new = B_FALSE;
1073 				pset->zsp_deleted = B_TRUE;
1074 				pset->zsp_active = B_TRUE;
1075 			}
1076 		}
1077 		pset = list_next(&ctl->zsctl_psets, pset);
1078 	}
1079 }
1080 
1081 /* Determine which zones are no longer bound to processor sets */
1082 static void
1083 zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1084 {
1085 	zsd_pset_t *pset;
1086 	zsd_zone_t *zone;
1087 	zsd_pset_usage_t *usage, *tmp;
1088 
1089 	/*
1090 	 * Mark pset as not exists, and deleted if it existed previous
1091 	 * interval.
1092 	 */
1093 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1094 	    pset = list_next(&ctl->zsctl_psets, pset)) {
1095 		usage = list_head(&pset->zsp_usage_list);
1096 		while (usage != NULL) {
1097 			/*
1098 			 * Mark pset as not exists, and deleted if it existed
1099 			 * previous interval.
1100 			 */
1101 			if (usage->zsu_found == B_FALSE ||
1102 			    usage->zsu_zone->zsz_deleted == B_TRUE ||
1103 			    usage->zsu_pset->zsp_deleted == B_TRUE) {
1104 				tmp = usage;
1105 				usage = list_next(&pset->zsp_usage_list,
1106 				    usage);
1107 				list_remove(&pset->zsp_usage_list, tmp);
1108 				free(tmp);
1109 				pset->zsp_nusage--;
1110 				ctl->zsctl_npset_usages--;
1111 				continue;
1112 			} else {
1113 				usage->zsu_new = B_FALSE;
1114 				usage->zsu_deleted = B_TRUE;
1115 				usage->zsu_active = B_TRUE;
1116 			}
1117 			/* Add cpu shares for usages that are in FSS */
1118 			zone = usage->zsu_zone;
1119 			if (usage->zsu_scheds & ZS_SCHED_FSS &&
1120 			    zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1121 			    zone->zsz_cpu_shares != 0) {
1122 				zone = usage->zsu_zone;
1123 				usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1124 				pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1125 			}
1126 			usage = list_next(&pset->zsp_usage_list,
1127 			    usage);
1128 		}
1129 	}
1130 }
1131 
1132 /* A zone has been found.  Update its information */
1133 static void
1134 zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
1135     uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
1136     uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
1137     uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
1138     uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
1139     uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
1140     uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
1141     uint_t iptype)
1142 {
1143 	zsd_system_t *sys = ctl->zsctl_system;
1144 
1145 	assert(zone->zsz_found == B_FALSE);
1146 
1147 	/*
1148 	 * Mark zone as exists, and new if it did not exist in previous
1149 	 * interval.
1150 	 */
1151 	zone->zsz_found = B_TRUE;
1152 	zone->zsz_empty = B_TRUE;
1153 	zone->zsz_deleted = B_FALSE;
1154 
1155 	/*
1156 	 * Zone is new.  Assume zone's properties are the same over entire
1157 	 * interval.
1158 	 */
1159 	if (zone->zsz_active == B_FALSE)
1160 		zone->zsz_new = B_TRUE;
1161 	else
1162 		zone->zsz_new = B_FALSE;
1163 
1164 	(void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
1165 	(void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
1166 	zone->zsz_default_sched = sched;
1167 
1168 	/* Schedulers updated later as processes are found */
1169 	zone->zsz_scheds = 0;
1170 
1171 	/* Cpus updated later as psets bound are identified */
1172 	zone->zsz_cpus_online = 0;
1173 
1174 	zone->zsz_cputype = cputype;
1175 	zone->zsz_iptype = iptype;
1176 	zone->zsz_psetid = ZS_PSET_ERROR;
1177 	zone->zsz_cpu_cap = cpu_cap;
1178 	zone->zsz_cpu_shares = cpu_shares;
1179 	zone->zsz_ram_cap = ram_cap;
1180 	zone->zsz_locked_cap = locked_cap;
1181 	zone->zsz_vm_cap = vm_cap;
1182 	zone->zsz_processes_cap = processes_cap;
1183 	zone->zsz_processes = processes;
1184 	zone->zsz_lwps_cap = lwps_cap;
1185 	zone->zsz_lwps = lwps;
1186 	zone->zsz_shm_cap = shm_cap;
1187 	zone->zsz_shm = shm;
1188 	zone->zsz_shmids_cap = shmids_cap;
1189 	zone->zsz_shmids = shmids;
1190 	zone->zsz_semids_cap = semids_cap;
1191 	zone->zsz_semids = semids;
1192 	zone->zsz_msgids_cap = msgids_cap;
1193 	zone->zsz_msgids = msgids;
1194 	zone->zsz_lofi_cap = lofi_cap;
1195 	zone->zsz_lofi = lofi;
1196 
1197 	sys->zss_processes += processes;
1198 	sys->zss_lwps += lwps;
1199 	sys->zss_shm += shm;
1200 	sys->zss_shmids += shmids;
1201 	sys->zss_semids += semids;
1202 	sys->zss_msgids += msgids;
1203 	sys->zss_lofi += lofi;
1204 	zone->zsz_active = B_TRUE;
1205 }
1206 
1207 
1208 /* Determine which zones have halted */
1209 static void
1210 zsd_mark_zones_end(zsd_ctl_t *ctl)
1211 {
1212 	zsd_zone_t *zone, *tmp;
1213 
1214 	/*
1215 	 * Mark zone as not existing, or delete if it did not exist in
1216 	 * previous interval.
1217 	 */
1218 	zone = list_head(&ctl->zsctl_zones);
1219 	while (zone != NULL) {
1220 		if (zone->zsz_found == B_FALSE) {
1221 			zone->zsz_empty = B_TRUE;
1222 			if (zone->zsz_deleted == B_TRUE) {
1223 				/*
1224 				 * Zone deleted in prior interval,
1225 				 * so it no longer exists.
1226 				 */
1227 				tmp = zone;
1228 				zone = list_next(&ctl->zsctl_zones, zone);
1229 				list_remove(&ctl->zsctl_zones, tmp);
1230 				free(tmp);
1231 				ctl->zsctl_nzones--;
1232 				continue;
1233 			} else {
1234 				zone->zsz_new = B_FALSE;
1235 				zone->zsz_deleted = B_TRUE;
1236 				zone->zsz_active = B_TRUE;
1237 			}
1238 		}
1239 		zone = list_next(&ctl->zsctl_zones, zone);
1240 	}
1241 }
1242 
1243 /*
1244  * Mark cpus as not existing.  If a cpu is found, it will be updated.  If
1245  * a cpu is not found, then it must have gone offline, so it will be
1246  * deleted.
1247  *
1248  * The kstat tracking data is rolled so that the usage since the previous
1249  * interval can be determined.
1250  */
1251 static void
1252 zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1253 {
1254 	zsd_cpu_t *cpu;
1255 
1256 	/*
1257 	 * Mark all cpus as not existing.  As cpus are found, they will
1258 	 * be marked as existing.
1259 	 */
1260 	for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1261 	    cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1262 		cpu->zsc_found = B_FALSE;
1263 		if (cpu->zsc_active == B_TRUE && roll) {
1264 			cpu->zsc_psetid_prev = cpu->zsc_psetid;
1265 			cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1266 			cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1267 			cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1268 			cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1269 		}
1270 	}
1271 }
1272 
1273 /*
1274  * An array the size of the maximum number of cpus is kept.  Within this array
1275  * a list of the online cpus is maintained.
1276  */
1277 zsd_cpu_t *
1278 zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1279 {
1280 	zsd_cpu_t *cpu;
1281 
1282 	assert(cpuid < ctl->zsctl_maxcpuid);
1283 	cpu = &(ctl->zsctl_cpu_array[cpuid]);
1284 	assert(cpuid == cpu->zsc_id);
1285 
1286 	if (cpu->zsc_allocated == B_FALSE) {
1287 		cpu->zsc_allocated = B_TRUE;
1288 		list_insert_tail(&ctl->zsctl_cpus, cpu);
1289 	}
1290 	return (cpu);
1291 }
1292 
1293 /* A cpu has been found.  Update its information */
1294 static void
1295 zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1296 {
1297 	/*
1298 	 * legacy processor sets, the cpu may move while zonestatd is
1299 	 * inspecting, causing it to be found twice.  In this case, just
1300 	 * leave cpu in the first processor set in which it was found.
1301 	 */
1302 	if (cpu->zsc_found == B_TRUE)
1303 		return;
1304 
1305 	/* Mark cpu as online */
1306 	cpu->zsc_found = B_TRUE;
1307 	cpu->zsc_offlined = B_FALSE;
1308 	cpu->zsc_pset = pset;
1309 	/*
1310 	 * cpu is newly online.
1311 	 */
1312 	if (cpu->zsc_active == B_FALSE) {
1313 		/*
1314 		 * Cpu is newly online.
1315 		 */
1316 		cpu->zsc_onlined = B_TRUE;
1317 		cpu->zsc_psetid = psetid;
1318 		cpu->zsc_psetid_prev = psetid;
1319 	} else {
1320 		/*
1321 		 * cpu online during previous interval.  Save properties at
1322 		 * start of interval
1323 		 */
1324 		cpu->zsc_onlined = B_FALSE;
1325 		cpu->zsc_psetid = psetid;
1326 
1327 	}
1328 	cpu->zsc_active = B_TRUE;
1329 }
1330 
1331 /* Remove all offlined cpus from the list of tracked cpus */
1332 static void
1333 zsd_mark_cpus_end(zsd_ctl_t *ctl)
1334 {
1335 	zsd_cpu_t *cpu, *tmp;
1336 	int id;
1337 
1338 	/* Mark cpu as online or offline */
1339 	cpu = list_head(&ctl->zsctl_cpus);
1340 	while (cpu != NULL) {
1341 		if (cpu->zsc_found == B_FALSE) {
1342 			if (cpu->zsc_offlined == B_TRUE) {
1343 				/*
1344 				 * cpu offlined in prior interval. It is gone.
1345 				 */
1346 				tmp = cpu;
1347 				cpu = list_next(&ctl->zsctl_cpus, cpu);
1348 				list_remove(&ctl->zsctl_cpus, tmp);
1349 				/* Clear structure for future use */
1350 				id = tmp->zsc_id;
1351 				bzero(tmp, sizeof (zsd_cpu_t));
1352 				tmp->zsc_id = id;
1353 				tmp->zsc_allocated = B_FALSE;
1354 				tmp->zsc_psetid = ZS_PSET_ERROR;
1355 				tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1356 
1357 			} else {
1358 				/*
1359 				 * cpu online at start of interval.  Treat
1360 				 * as still online, since it was online for
1361 				 * some portion of the interval.
1362 				 */
1363 				cpu->zsc_offlined = B_TRUE;
1364 				cpu->zsc_onlined = B_FALSE;
1365 				cpu->zsc_active = B_TRUE;
1366 				cpu->zsc_psetid = cpu->zsc_psetid_prev;
1367 				cpu->zsc_pset = NULL;
1368 			}
1369 		}
1370 		cpu = list_next(&ctl->zsctl_cpus, cpu);
1371 	}
1372 }
1373 
1374 /* Some utility functions for managing the list of processor sets */
1375 static zsd_pset_t *
1376 zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1377 {
1378 	zsd_pset_t *pset;
1379 
1380 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1381 	    pset = list_next(&ctl->zsctl_psets, pset)) {
1382 		if (pset->zsp_id == psetid)
1383 			return (pset);
1384 	}
1385 	return (NULL);
1386 }
1387 
1388 static zsd_pset_t *
1389 zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1390 {
1391 	zsd_pset_t *pset;
1392 
1393 	for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1394 	    pset = list_next(&ctl->zsctl_psets, pset)) {
1395 		if (strcmp(pset->zsp_name, psetname) == 0) {
1396 			if (psetid != -1)
1397 				pset->zsp_id = psetid;
1398 			return (pset);
1399 		}
1400 	}
1401 	return (NULL);
1402 }
1403 
1404 static zsd_pset_t *
1405 zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1406 {
1407 	zsd_pset_t *pset;
1408 
1409 	if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1410 		return (NULL);
1411 
1412 	(void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1413 	pset->zsp_id = psetid;
1414 	pset->zsp_found = B_FALSE;
1415 	/*
1416 	 * Allocate as deleted so if not found in first pass, pset is deleted
1417 	 * from list.  This can happen if pset is returned by pset_list, but
1418 	 * is destroyed before first attempt to fetch pset details.
1419 	 */
1420 	list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1421 	    offsetof(zsd_pset_usage_t, zsu_next));
1422 
1423 	pset->zsp_hrstart = g_hrnow;
1424 	pset->zsp_deleted = B_TRUE;
1425 	pset->zsp_empty = B_TRUE;
1426 	ctl->zsctl_npsets++;
1427 
1428 	return (pset);
1429 }
1430 
1431 static zsd_pset_t *
1432 zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1433 {
1434 	zsd_pset_t *pset, *tmp;
1435 
1436 	if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1437 		return (pset);
1438 
1439 	if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1440 		return (NULL);
1441 
1442 	/* Insert sorted by psetname */
1443 	tmp = list_head(&ctl->zsctl_psets);
1444 	while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1445 		tmp = list_next(&ctl->zsctl_psets, tmp);
1446 
1447 	list_insert_before(&ctl->zsctl_psets, tmp, pset);
1448 	return (pset);
1449 }
1450 
1451 /* Some utility functions for managing the list of zones using each pset */
1452 static zsd_pset_usage_t *
1453 zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1454 {
1455 	zsd_pset_usage_t *usage;
1456 
1457 	for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1458 	    usage = list_next(&pset->zsp_usage_list, usage))
1459 		if (usage->zsu_zone == zone)
1460 			return (usage);
1461 
1462 	return (NULL);
1463 }
1464 
1465 static zsd_pset_usage_t *
1466 zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1467 {
1468 	zsd_pset_usage_t *usage;
1469 
1470 	if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1471 	    == NULL)
1472 		return (NULL);
1473 
1474 	list_link_init(&usage->zsu_next);
1475 	usage->zsu_zone = zone;
1476 	usage->zsu_zoneid = zone->zsz_id;
1477 	usage->zsu_pset = pset;
1478 	usage->zsu_found = B_FALSE;
1479 	usage->zsu_active = B_FALSE;
1480 	usage->zsu_new = B_FALSE;
1481 	/*
1482 	 * Allocate as not deleted.  If a process is found in a pset for
1483 	 * a zone, the usage will not be deleted until at least the next
1484 	 * interval.
1485 	 */
1486 	usage->zsu_start = g_now;
1487 	usage->zsu_hrstart = g_hrnow;
1488 	usage->zsu_deleted = B_FALSE;
1489 	usage->zsu_empty = B_TRUE;
1490 	usage->zsu_scheds = 0;
1491 	usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1492 
1493 	ctl->zsctl_npset_usages++;
1494 	pset->zsp_nusage++;
1495 
1496 	return (usage);
1497 }
1498 
1499 static zsd_pset_usage_t *
1500 zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1501 {
1502 	zsd_pset_usage_t *usage, *tmp;
1503 
1504 	if ((usage = zsd_lookup_usage(pset, zone))
1505 	    != NULL)
1506 		return (usage);
1507 
1508 	if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1509 		return (NULL);
1510 
1511 	tmp = list_head(&pset->zsp_usage_list);
1512 	while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1513 	    > 0)
1514 		tmp = list_next(&pset->zsp_usage_list, tmp);
1515 
1516 	list_insert_before(&pset->zsp_usage_list, tmp, usage);
1517 	return (usage);
1518 }
1519 
1520 static void
1521 zsd_refresh_system(zsd_ctl_t *ctl)
1522 {
1523 	zsd_system_t *system = ctl->zsctl_system;
1524 
1525 	/* Re-count these values each interval */
1526 	system->zss_processes = 0;
1527 	system->zss_lwps = 0;
1528 	system->zss_shm = 0;
1529 	system->zss_shmids = 0;
1530 	system->zss_semids = 0;
1531 	system->zss_msgids = 0;
1532 	system->zss_lofi = 0;
1533 }
1534 
1535 
1536 /* Reads each cpu's kstats, and adds the usage to the cpu's pset */
1537 static void
1538 zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
1539 {
1540 	zsd_system_t *sys;
1541 	processorid_t cpuid;
1542 	zsd_pset_t *pset_prev;
1543 	zsd_pset_t *pset;
1544 	kstat_t *kstat;
1545 	kstat_named_t *knp;
1546 	kid_t kid;
1547 	uint64_t idle, intr, kern, user;
1548 
1549 	sys = ctl->zsctl_system;
1550 	pset = cpu->zsc_pset;
1551 	knp = NULL;
1552 	kid = -1;
1553 	cpuid = cpu->zsc_id;
1554 
1555 	/* Get the cpu time totals for this cpu */
1556 	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
1557 	if (kstat == NULL)
1558 		return;
1559 
1560 	kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
1561 	if (kid == -1)
1562 		return;
1563 
1564 	knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
1565 	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1566 		return;
1567 
1568 	idle = knp->value.ui64;
1569 
1570 	knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
1571 	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1572 		return;
1573 
1574 	kern = knp->value.ui64;
1575 
1576 	knp = kstat_data_lookup(kstat, "cpu_nsec_user");
1577 	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1578 		return;
1579 
1580 	user = knp->value.ui64;
1581 
1582 	/*
1583 	 * Tracking intr time per cpu just exists for future enhancements.
1584 	 * The value is presently always zero.
1585 	 */
1586 	intr = 0;
1587 	cpu->zsc_nsec_idle = idle;
1588 	cpu->zsc_nsec_intr = intr;
1589 	cpu->zsc_nsec_kern = kern;
1590 	cpu->zsc_nsec_user = user;
1591 
1592 	if (cpu->zsc_onlined == B_TRUE) {
1593 		/*
1594 		 * cpu is newly online.  There is no reference value,
1595 		 * so just record its current stats for comparison
1596 		 * on next stat read.
1597 		 */
1598 		cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1599 		cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1600 		cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1601 		cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1602 		return;
1603 	}
1604 
1605 	/*
1606 	 * Calculate relative time since previous refresh.
1607 	 * Paranoia.  Don't let time  go backwards.
1608 	 */
1609 	idle = intr = kern = user = 0;
1610 	if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
1611 		idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;
1612 
1613 	if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
1614 		intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;
1615 
1616 	if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
1617 		kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;
1618 
1619 	if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
1620 		user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;
1621 
1622 	/* Update totals for cpu usage */
1623 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
1624 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
1625 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
1626 	TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);
1627 
1628 	/*
1629 	 * Add cpu's stats to its pset if it is known to be in
1630 	 * the pset since previous read.
1631 	 */
1632 	if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
1633 	    cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
1634 	    (pset_prev = zsd_lookup_pset_byid(ctl,
1635 	    cpu->zsc_psetid_prev)) == NULL) {
1636 		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
1637 		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
1638 		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
1639 		TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
1640 	} else {
1641 		/*
1642 		 * Last pset was different than current pset.
1643 		 * Best guess is to split usage between the two.
1644 		 */
1645 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
1646 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
1647 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
1648 		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);
1649 
1650 		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
1651 		    (idle / 2) + (idle % 2));
1652 		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
1653 		    (intr / 2) + (intr % 2));
1654 		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
1655 		    (kern / 2) + (kern % 2));
1656 		TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
1657 		    (user / 2) + (user % 2));
1658 	}
1659 	TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
1660 	TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
1661 	TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
1662 	TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
1663 }
1664 
1665 /* Determine the details of a processor set by pset_id */
1666 static int
1667 zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
1668     size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
1669     uint64_t *min, uint64_t *max, int64_t *importance)
1670 {
1671 	uint_t old, num;
1672 
1673 	pool_conf_t *conf = ctl->zsctl_pool_conf;
1674 	pool_value_t **vals = ctl->zsctl_pool_vals;
1675 	pool_resource_t **res_list = NULL;
1676 	pool_resource_t *pset;
1677 	pool_component_t **cpus = NULL;
1678 	processorid_t *cache;
1679 	const char *string;
1680 	uint64_t uint64;
1681 	int64_t int64;
1682 	int i, ret, type;
1683 
1684 	if (ctl->zsctl_pool_status == POOL_DISABLED) {
1685 
1686 		/*
1687 		 * Inspect legacy psets
1688 		 */
1689 		for (;;) {
1690 			old = num = ctl->zsctl_cpu_ncache;
1691 			ret = pset_info(psetid, &type, &num,
1692 			    ctl->zsctl_cpu_cache);
1693 			if (ret < 0) {
1694 				/* pset is gone.  Tell caller to retry */
1695 				errno = EINTR;
1696 				return (-1);
1697 			}
1698 			if (num <= old) {
1699 			/* Success */
1700 				break;
1701 			}
1702 			if ((cache = (processorid_t *)realloc(
1703 			    ctl->zsctl_cpu_cache, num *
1704 			    sizeof (processorid_t))) != NULL) {
1705 				ctl->zsctl_cpu_ncache = num;
1706 				ctl->zsctl_cpu_cache = cache;
1707 			} else {
1708 				/*
1709 				 * Could not allocate to get new cpu list.
1710 				 */
1711 				zsd_warn(gettext(
1712 				    "Could not allocate for cpu list"));
1713 				errno = ENOMEM;
1714 				return (-1);
1715 			}
1716 		}
1717 		/*
1718 		 * Old school pset.  Just make min and max equal
1719 		 * to its size
1720 		 */
1721 		if (psetid == ZS_PSET_DEFAULT) {
1722 			*cputype = ZS_CPUTYPE_DEFAULT_PSET;
1723 			(void) strlcpy(psetname, "pset_default", namelen);
1724 		} else {
1725 			*cputype = ZS_CPUTYPE_PSRSET_PSET;
1726 			(void) snprintf(psetname, namelen,
1727 			    "SUNWlegacy_pset_%d", psetid);
1728 		}
1729 
1730 		/*
1731 		 * Just treat legacy pset as a simple pool pset
1732 		 */
1733 		*online = num;
1734 		*size = num;
1735 		*min = num;
1736 		*max = num;
1737 		*importance = 1;
1738 
1739 		return (0);
1740 	}
1741 
1742 	/* Look up the pool pset using the pset id */
1743 	res_list = NULL;
1744 	pool_value_set_int64(vals[1], psetid);
1745 	if (pool_value_set_name(vals[1], "pset.sys_id")
1746 	    != PO_SUCCESS)
1747 		goto err;
1748 
1749 	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1750 		goto err;
1751 	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1752 		goto err;
1753 	if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
1754 		goto err;
1755 	if (num != 1)
1756 		goto err;
1757 	pset = res_list[0];
1758 	free(res_list);
1759 	res_list = NULL;
1760 	if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
1761 	    "pset.name", vals[0]) != POC_STRING ||
1762 	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1763 		goto err;
1764 
1765 	(void) strlcpy(psetname, string, namelen);
1766 	if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
1767 		*cputype = ZS_CPUTYPE_DEDICATED;
1768 	else if (psetid == ZS_PSET_DEFAULT)
1769 		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
1770 	else
1771 		*cputype = ZS_CPUTYPE_POOL_PSET;
1772 
1773 	/* Get size, min, max, and importance */
1774 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1775 	    pset), "pset.size", vals[0]) == POC_UINT &&
1776 	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1777 		*size = uint64;
1778 	else
1779 		*size = 0;
1780 
1781 		/* Get size, min, max, and importance */
1782 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1783 	    pset), "pset.min", vals[0]) == POC_UINT &&
1784 	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1785 		*min = uint64;
1786 	else
1787 		*min = 0;
1788 	if (*min >= ZSD_PSET_UNLIMITED)
1789 		*min = ZS_LIMIT_NONE;
1790 
1791 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1792 	    pset), "pset.max", vals[0]) == POC_UINT &&
1793 	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1794 		*max = uint64;
1795 	else
1796 		*max = ZS_LIMIT_NONE;
1797 
1798 	if (*max >= ZSD_PSET_UNLIMITED)
1799 		*max = ZS_LIMIT_NONE;
1800 
1801 	if (pool_get_property(conf, pool_resource_to_elem(conf,
1802 	    pset), "pset.importance", vals[0]) == POC_INT &&
1803 	    pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
1804 		*importance = int64;
1805 	else
1806 		*importance = (uint64_t)1;
1807 
1808 	*online = 0;
1809 	if (*size == 0)
1810 		return (0);
1811 
1812 	/* get cpus */
1813 	cpus = pool_query_resource_components(conf, pset, &num, NULL);
1814 	if (cpus == NULL)
1815 		goto err;
1816 
1817 	/* Make sure there is space for cpu id list */
1818 	if (num > ctl->zsctl_cpu_ncache) {
1819 		if ((cache = (processorid_t *)realloc(
1820 		    ctl->zsctl_cpu_cache, num *
1821 		    sizeof (processorid_t))) != NULL) {
1822 			ctl->zsctl_cpu_ncache = num;
1823 			ctl->zsctl_cpu_cache = cache;
1824 		} else {
1825 			/*
1826 			 * Could not allocate to get new cpu list.
1827 			 */
1828 			zsd_warn(gettext(
1829 			    "Could not allocate for cpu list"));
1830 			goto err;
1831 		}
1832 	}
1833 
1834 	/* count the online cpus */
1835 	for (i = 0; i < num; i++) {
1836 		if (pool_get_property(conf, pool_component_to_elem(
1837 		    conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
1838 		    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1839 			goto err;
1840 
1841 		if (strcmp(string, "on-line") != 0 &&
1842 		    strcmp(string, "no-intr") != 0)
1843 			continue;
1844 
1845 		if (pool_get_property(conf, pool_component_to_elem(
1846 		    conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
1847 		    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
1848 			goto err;
1849 
1850 		(*online)++;
1851 		ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
1852 	}
1853 	free(cpus);
1854 	return (0);
1855 err:
1856 	if (res_list != NULL)
1857 		free(res_list);
1858 	if (cpus != NULL)
1859 		free(cpus);
1860 
1861 	/*
1862 	 * The pools operations should succeed since the conf is a consistent
1863 	 * snapshot.  Tell caller there is no need to retry.
1864 	 */
1865 	errno = EINVAL;
1866 	return (-1);
1867 }
1868 
1869 /*
1870  * Update the current list of processor sets.
1871  * This also updates the list of online cpus, and each cpu's pset membership.
1872  */
1873 static void
1874 zsd_refresh_psets(zsd_ctl_t *ctl)
1875 {
1876 	int i, j, ret, state;
1877 	uint_t old, num;
1878 	uint_t cputype;
1879 	int64_t sys_id, importance;
1880 	uint64_t online, size, min, max;
1881 	zsd_system_t *system;
1882 	zsd_pset_t *pset;
1883 	zsd_cpu_t *cpu;
1884 	psetid_t *cache;
1885 	char psetname[ZS_PSETNAME_MAX];
1886 	processorid_t cpuid;
1887 	pool_value_t *pv_save = NULL;
1888 	pool_resource_t **res_list = NULL;
1889 	pool_resource_t *res;
1890 	pool_value_t **vals;
1891 	pool_conf_t *conf;
1892 	boolean_t roll_cpus = B_TRUE;
1893 
1894 	/* Zero cpu counters to recount them */
1895 	system = ctl->zsctl_system;
1896 	system->zss_ncpus = 0;
1897 	system->zss_ncpus_online = 0;
1898 retry:
1899 	ret = pool_get_status(&state);
1900 	if (ret == 0 && state == POOL_ENABLED) {
1901 
1902 		conf = ctl->zsctl_pool_conf;
1903 		vals = ctl->zsctl_pool_vals;
1904 		pv_save = vals[1];
1905 		vals[1] = NULL;
1906 
1907 		if (ctl->zsctl_pool_status == POOL_DISABLED) {
1908 			if (pool_conf_open(ctl->zsctl_pool_conf,
1909 			    pool_dynamic_location(), PO_RDONLY) == 0) {
1910 				ctl->zsctl_pool_status = POOL_ENABLED;
1911 				ctl->zsctl_pool_changed = POU_PSET;
1912 			}
1913 		} else {
1914 			ctl->zsctl_pool_changed = 0;
1915 			ret = pool_conf_update(ctl->zsctl_pool_conf,
1916 			    &(ctl->zsctl_pool_changed));
1917 			if (ret < 0) {
1918 				/* Pools must have become disabled */
1919 				(void) pool_conf_close(ctl->zsctl_pool_conf);
1920 				ctl->zsctl_pool_status = POOL_DISABLED;
1921 				if (pool_error() == POE_SYSTEM && errno ==
1922 				    ENOTACTIVE)
1923 					goto retry;
1924 
1925 				zsd_warn(gettext(
1926 				    "Unable to update pool configuration"));
1927 				/* Not able to get pool info.  Don't update. */
1928 				goto err;
1929 			}
1930 		}
1931 		/* Get the list of psets using libpool */
1932 		if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1933 			goto err;
1934 
1935 		if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1936 			goto err;
1937 		if ((res_list = pool_query_resources(conf, &num, vals))
1938 		    == NULL)
1939 			goto err;
1940 
1941 		if (num > ctl->zsctl_pset_ncache)  {
1942 			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1943 			    (num) * sizeof (psetid_t))) == NULL) {
1944 				goto err;
1945 			}
1946 			ctl->zsctl_pset_ncache = num;
1947 			ctl->zsctl_pset_cache = cache;
1948 		}
1949 		/* Save the pset id of each pset */
1950 		for (i = 0; i < num; i++) {
1951 			res = res_list[i];
1952 			if (pool_get_property(conf, pool_resource_to_elem(conf,
1953 			    res), "pset.sys_id", vals[0]) != POC_INT ||
1954 			    pool_value_get_int64(vals[0], &sys_id)
1955 			    != PO_SUCCESS)
1956 				goto err;
1957 			ctl->zsctl_pset_cache[i] = (int)sys_id;
1958 		}
1959 		vals[1] = pv_save;
1960 		pv_save = NULL;
1961 	} else {
1962 		if (ctl->zsctl_pool_status == POOL_ENABLED) {
1963 			(void) pool_conf_close(ctl->zsctl_pool_conf);
1964 			ctl->zsctl_pool_status = POOL_DISABLED;
1965 		}
1966 		/* Get the pset list using legacy psets */
1967 		for (;;) {
1968 			old = num = ctl->zsctl_pset_ncache;
1969 			(void) pset_list(ctl->zsctl_pset_cache, &num);
1970 			if ((num + 1) <= old) {
1971 				break;
1972 			}
1973 			if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1974 			    (num + 1) * sizeof (psetid_t))) != NULL) {
1975 				ctl->zsctl_pset_ncache = num + 1;
1976 				ctl->zsctl_pset_cache = cache;
1977 			} else {
1978 				/*
1979 				 * Could not allocate to get new pset list.
1980 				 * Give up
1981 				 */
1982 				return;
1983 			}
1984 		}
1985 		/* Add the default pset to list */
1986 		ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
1987 		ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
1988 		num++;
1989 	}
1990 psets_changed:
1991 	zsd_mark_cpus_start(ctl, roll_cpus);
1992 	zsd_mark_psets_start(ctl);
1993 	roll_cpus = B_FALSE;
1994 
1995 	/* Refresh cpu membership of all psets */
1996 	for (i = 0; i < num; i++) {
1997 
1998 		/* Get pool pset information */
1999 		sys_id = ctl->zsctl_pset_cache[i];
2000 		if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
2001 		    &cputype, &online, &size, &min, &max, &importance)
2002 		    != 0) {
2003 			if (errno == EINTR)
2004 				goto psets_changed;
2005 			zsd_warn(gettext("Failed to get info for pset %d"),
2006 			    sys_id);
2007 			continue;
2008 		}
2009 
2010 		system->zss_ncpus += size;
2011 		system->zss_ncpus_online += online;
2012 
2013 		pset = zsd_lookup_insert_pset(ctl, psetname,
2014 		    ctl->zsctl_pset_cache[i]);
2015 
2016 		/* update pset info */
2017 		zsd_mark_pset_found(pset, cputype, online, size, min,
2018 		    max, importance);
2019 
2020 		/* update each cpu in pset */
2021 		for (j = 0; j < pset->zsp_online; j++) {
2022 			cpuid = ctl->zsctl_cpu_cache[j];
2023 			cpu = zsd_lookup_insert_cpu(ctl, cpuid);
2024 			zsd_mark_cpu_found(cpu, pset, sys_id);
2025 		}
2026 	}
2027 err:
2028 	if (res_list != NULL)
2029 		free(res_list);
2030 	if (pv_save != NULL)
2031 		vals[1] = pv_save;
2032 }
2033 
2034 
2035 
2036 /*
2037  * Fetch the current pool and pset name for the given zone.
2038  */
2039 static void
2040 zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2041     char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2042 {
2043 	poolid_t poolid;
2044 	pool_t **pools = NULL;
2045 	pool_resource_t **res_list = NULL;
2046 	char poolname[ZS_POOLNAME_MAX];
2047 	char psetname[ZS_PSETNAME_MAX];
2048 	pool_conf_t *conf = ctl->zsctl_pool_conf;
2049 	pool_value_t *pv_save = NULL;
2050 	pool_value_t **vals = ctl->zsctl_pool_vals;
2051 	const char *string;
2052 	int ret;
2053 	int64_t int64;
2054 	uint_t num;
2055 
2056 	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2057 	    &poolid, sizeof (poolid));
2058 	if (ret < 0)
2059 		goto lookup_done;
2060 
2061 	pv_save = vals[1];
2062 	vals[1] = NULL;
2063 	pools = NULL;
2064 	res_list = NULL;
2065 
2066 	/* Default values if lookup fails */
2067 	(void) strlcpy(poolname, "pool_default", sizeof (poolname));
2068 	(void) strlcpy(psetname, "pset_default", sizeof (poolname));
2069 	*cputype = ZS_CPUTYPE_DEFAULT_PSET;
2070 
2071 	/* no dedicated cpu if pools are disabled */
2072 	if (ctl->zsctl_pool_status == POOL_DISABLED)
2073 		goto lookup_done;
2074 
2075 	/* Get the pool name using the id */
2076 	pool_value_set_int64(vals[0], poolid);
2077 	if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2078 		goto lookup_done;
2079 
2080 	if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2081 		goto lookup_done;
2082 
2083 	if (num != 1)
2084 		goto lookup_done;
2085 
2086 	if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2087 	    "pool.name", vals[0]) != POC_STRING ||
2088 	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2089 		goto lookup_done;
2090 	(void) strlcpy(poolname, (char *)string, sizeof (poolname));
2091 
2092 	/* Get the name of the pset for the pool */
2093 	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2094 		goto lookup_done;
2095 
2096 	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2097 		goto lookup_done;
2098 
2099 	if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2100 	    == NULL)
2101 		goto lookup_done;
2102 
2103 	if (num != 1)
2104 		goto lookup_done;
2105 
2106 	if (pool_get_property(conf, pool_resource_to_elem(conf,
2107 	    res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2108 	    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2109 		goto lookup_done;
2110 
2111 	if (int64 == ZS_PSET_DEFAULT)
2112 		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
2113 
2114 	if (pool_get_property(conf, pool_resource_to_elem(conf,
2115 	    res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2116 	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2117 		goto lookup_done;
2118 
2119 	(void) strlcpy(psetname, (char *)string, sizeof (psetname));
2120 
2121 	if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2122 		*cputype = ZS_CPUTYPE_DEDICATED;
2123 	if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2124 		*cputype = ZS_CPUTYPE_PSRSET_PSET;
2125 	else
2126 		*cputype = ZS_CPUTYPE_POOL_PSET;
2127 
2128 lookup_done:
2129 
2130 	if (pv_save != NULL)
2131 		vals[1] = pv_save;
2132 
2133 	if (res_list)
2134 		free(res_list);
2135 	if (pools)
2136 		free(pools);
2137 
2138 	(void) strlcpy(pool, poolname, poollen);
2139 	(void) strlcpy(pset, psetname, psetlen);
2140 }
2141 
2142 /* Convert scheduler names to ZS_* scheduler flags */
2143 static uint_t
2144 zsd_schedname2int(char *clname, int pri)
2145 {
2146 	uint_t sched = 0;
2147 
2148 	if (strcmp(clname, "TS") == 0) {
2149 		sched = ZS_SCHED_TS;
2150 	} else if (strcmp(clname, "IA") == 0) {
2151 		sched = ZS_SCHED_IA;
2152 	} else if (strcmp(clname, "FX") == 0) {
2153 		if (pri > 59) {
2154 			sched = ZS_SCHED_FX_60;
2155 		} else {
2156 			sched = ZS_SCHED_FX;
2157 		}
2158 	} else if (strcmp(clname, "RT") == 0) {
2159 		sched = ZS_SCHED_RT;
2160 
2161 	} else if (strcmp(clname, "FSS") == 0) {
2162 		sched = ZS_SCHED_FSS;
2163 	}
2164 	return (sched);
2165 }
2166 
2167 static uint64_t
2168 zsd_get_zone_rctl_limit(char *name)
2169 {
2170 	rctlblk_t *rblk;
2171 
2172 	rblk = (rctlblk_t *)alloca(rctlblk_size());
2173 	if (getrctl(name, NULL, rblk, RCTL_FIRST)
2174 	    != 0) {
2175 		return (ZS_LIMIT_NONE);
2176 	}
2177 	return (rctlblk_get_value(rblk));
2178 }
2179 
2180 static uint64_t
2181 zsd_get_zone_rctl_usage(char *name)
2182 {
2183 	rctlblk_t *rblk;
2184 
2185 	rblk = (rctlblk_t *)alloca(rctlblk_size());
2186 	if (getrctl(name, NULL, rblk, RCTL_USAGE)
2187 	    != 0) {
2188 		return (0);
2189 	}
2190 	return (rctlblk_get_value(rblk));
2191 }
2192 
2193 #define	ZSD_NUM_RCTL_VALS 19
2194 
2195 /*
2196  * Fetch the limit information for a zone.  This uses zone_enter() as the
2197  * getrctl(2) system call only returns rctl information for the zone of
2198  * the caller.
2199  */
2200 static int
2201 zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
2202     uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
2203     uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
2204     uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
2205     uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
2206     uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
2207     uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
2208 {
2209 	int p[2], pid, tmpl_fd, ret;
2210 	ctid_t ct;
2211 	char class[PC_CLNMSZ];
2212 	uint64_t vals[ZSD_NUM_RCTL_VALS];
2213 	zsd_system_t *sys = ctl->zsctl_system;
2214 	int i = 0;
2215 	int res = 0;
2216 
2217 	/* Treat all caps as no cap on error */
2218 	*cpu_shares = ZS_LIMIT_NONE;
2219 	*cpu_cap = ZS_LIMIT_NONE;
2220 	*ram_cap = ZS_LIMIT_NONE;
2221 	*locked_cap = ZS_LIMIT_NONE;
2222 	*vm_cap = ZS_LIMIT_NONE;
2223 
2224 	*processes_cap = ZS_LIMIT_NONE;
2225 	*lwps_cap = ZS_LIMIT_NONE;
2226 	*shm_cap = ZS_LIMIT_NONE;
2227 	*shmids_cap = ZS_LIMIT_NONE;
2228 	*semids_cap = ZS_LIMIT_NONE;
2229 	*msgids_cap = ZS_LIMIT_NONE;
2230 	*lofi_cap = ZS_LIMIT_NONE;
2231 
2232 	*processes = 0;
2233 	*lwps = 0;
2234 	*shm = 0;
2235 	*shmids = 0;
2236 	*semids = 0;
2237 	*msgids = 0;
2238 	*lofi = 0;
2239 
2240 	/* Get the ram cap first since it is a zone attr */
2241 	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
2242 	    ram_cap, sizeof (*ram_cap));
2243 	if (ret < 0 || *ram_cap == 0)
2244 		*ram_cap = ZS_LIMIT_NONE;
2245 
2246 	/* Get the zone's default scheduling class */
2247 	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
2248 	    class, sizeof (class));
2249 	if (ret < 0)
2250 		return (-1);
2251 
2252 	*sched = zsd_schedname2int(class, 0);
2253 
2254 	/* rctl caps must be fetched from within the zone */
2255 	if (pipe(p) != 0)
2256 		return (-1);
2257 
2258 	if ((tmpl_fd = init_template()) == -1) {
2259 		(void) close(p[0]);
2260 		(void) close(p[1]);
2261 		return (-1);
2262 	}
2263 	pid = forkx(0);
2264 	if (pid < 0) {
2265 		(void) ct_tmpl_clear(tmpl_fd);
2266 		(void) close(p[0]);
2267 		(void) close(p[1]);
2268 		return (-1);
2269 	}
2270 	if (pid == 0) {
2271 
2272 		(void) ct_tmpl_clear(tmpl_fd);
2273 		(void) close(tmpl_fd);
2274 		(void) close(p[0]);
2275 		if (zone->zsz_id != getzoneid()) {
2276 			if (zone_enter(zone->zsz_id) < 0) {
2277 				(void) close(p[1]);
2278 				_exit(0);
2279 			}
2280 		}
2281 
2282 		/* Get caps for zone, and write them to zonestatd parent. */
2283 		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
2284 		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
2285 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
2286 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
2287 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
2288 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
2289 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
2290 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
2291 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
2292 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
2293 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
2294 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
2295 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
2296 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
2297 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
2298 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
2299 		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
2300 		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");
2301 
2302 		if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2303 		    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2304 			(void) close(p[1]);
2305 			_exit(1);
2306 		}
2307 
2308 		(void) close(p[1]);
2309 		_exit(0);
2310 	}
2311 	if (contract_latest(&ct) == -1)
2312 		ct = -1;
2313 
2314 	(void) ct_tmpl_clear(tmpl_fd);
2315 	(void) close(tmpl_fd);
2316 	(void) close(p[1]);
2317 	while (waitpid(pid, NULL, 0) != pid)
2318 		;
2319 
2320 	/* Read cap from child in zone */
2321 	if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2322 	    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2323 		res = -1;
2324 		goto cleanup;
2325 	}
2326 	i = 0;
2327 	*cpu_shares = vals[i++];
2328 	*cpu_cap = vals[i++];
2329 	*locked_cap = vals[i++];
2330 	*vm_cap = vals[i++];
2331 	*processes_cap = vals[i++];
2332 	*processes = vals[i++];
2333 	*lwps_cap = vals[i++];
2334 	*lwps = vals[i++];
2335 	*shm_cap = vals[i++];
2336 	*shm = vals[i++];
2337 	*shmids_cap = vals[i++];
2338 	*shmids = vals[i++];
2339 	*semids_cap = vals[i++];
2340 	*semids = vals[i++];
2341 	*msgids_cap = vals[i++];
2342 	*msgids = vals[i++];
2343 	*lofi_cap = vals[i++];
2344 	*lofi = vals[i++];
2345 
2346 	/* Interpret maximum values as no cap */
2347 	if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
2348 		*cpu_cap = ZS_LIMIT_NONE;
2349 	if (*processes_cap == sys->zss_processes_max)
2350 		*processes_cap = ZS_LIMIT_NONE;
2351 	if (*lwps_cap == sys->zss_lwps_max)
2352 		*lwps_cap = ZS_LIMIT_NONE;
2353 	if (*shm_cap == sys->zss_shm_max)
2354 		*shm_cap = ZS_LIMIT_NONE;
2355 	if (*shmids_cap == sys->zss_shmids_max)
2356 		*shmids_cap = ZS_LIMIT_NONE;
2357 	if (*semids_cap == sys->zss_semids_max)
2358 		*semids_cap = ZS_LIMIT_NONE;
2359 	if (*msgids_cap == sys->zss_msgids_max)
2360 		*msgids_cap = ZS_LIMIT_NONE;
2361 	if (*lofi_cap == sys->zss_lofi_max)
2362 		*lofi_cap = ZS_LIMIT_NONE;
2363 
2364 
2365 cleanup:
2366 	(void) close(p[0]);
2367 	(void) ct_tmpl_clear(tmpl_fd);
2368 	(void) close(tmpl_fd);
2369 	(void) contract_abandon_id(ct);
2370 
2371 	return (res);
2372 }
2373 
2374 /* Update the current list of running zones */
2375 static void
2376 zsd_refresh_zones(zsd_ctl_t *ctl)
2377 {
2378 	zsd_zone_t *zone;
2379 	uint_t old, num;
2380 	ushort_t flags;
2381 	int i, ret;
2382 	zoneid_t *cache;
2383 	uint64_t cpu_shares;
2384 	uint64_t cpu_cap;
2385 	uint64_t ram_cap;
2386 	uint64_t locked_cap;
2387 	uint64_t vm_cap;
2388 	uint64_t processes_cap;
2389 	uint64_t processes;
2390 	uint64_t lwps_cap;
2391 	uint64_t lwps;
2392 	uint64_t shm_cap;
2393 	uint64_t shm;
2394 	uint64_t shmids_cap;
2395 	uint64_t shmids;
2396 	uint64_t semids_cap;
2397 	uint64_t semids;
2398 	uint64_t msgids_cap;
2399 	uint64_t msgids;
2400 	uint64_t lofi_cap;
2401 	uint64_t lofi;
2402 
2403 	char zonename[ZS_ZONENAME_MAX];
2404 	char poolname[ZS_POOLNAME_MAX];
2405 	char psetname[ZS_PSETNAME_MAX];
2406 	uint_t sched;
2407 	uint_t cputype;
2408 	uint_t iptype;
2409 
2410 	/* Get the current list of running zones */
2411 	for (;;) {
2412 		old = num = ctl->zsctl_zone_ncache;
2413 		(void) zone_list(ctl->zsctl_zone_cache, &num);
2414 		if (num <= old)
2415 			break;
2416 		if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
2417 		    (num) * sizeof (zoneid_t))) != NULL) {
2418 			ctl->zsctl_zone_ncache = num;
2419 			ctl->zsctl_zone_cache = cache;
2420 		} else {
2421 			/* Could not allocate to get new zone list.  Give up */
2422 			return;
2423 		}
2424 	}
2425 
2426 	zsd_mark_zones_start(ctl);
2427 
2428 	for (i = 0; i < num; i++) {
2429 
2430 		ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2431 		    zonename, sizeof (zonename));
2432 		if (ret < 0)
2433 			continue;
2434 
2435 		zone = zsd_lookup_insert_zone(ctl, zonename,
2436 		    ctl->zsctl_zone_cache[i]);
2437 
2438 		ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2439 		    &flags, sizeof (flags));
2440 		if (ret < 0)
2441 			continue;
2442 
2443 		if (flags & ZF_NET_EXCL)
2444 			iptype = ZS_IPTYPE_EXCLUSIVE;
2445 		else
2446 			iptype = ZS_IPTYPE_SHARED;
2447 
2448 		zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2449 		    psetname, sizeof (psetname), &cputype);
2450 
2451 		if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2452 		    &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2453 		    &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2454 		    &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2455 		    &lofi, &sched) != 0)
2456 			continue;
2457 
2458 		zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2459 		    locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2460 		    lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2461 		    semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2462 		    psetname, sched, cputype, iptype);
2463 	}
2464 }
2465 
2466 /* Fetch the details of a process from its psinfo_t */
2467 static void
2468 zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2469     psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2470     timestruc_t *delta, uint_t *sched)
2471 {
2472 	timestruc_t d;
2473 	zsd_proc_t *proc;
2474 
2475 	/* Get cached data for proc */
2476 	proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2477 	*psetid = psinfo->pr_lwp.pr_bindpset;
2478 
2479 	if (proc->zspr_psetid == ZS_PSET_ERROR)
2480 		*prev_psetid = *psetid;
2481 	else
2482 		*prev_psetid = proc->zspr_psetid;
2483 
2484 	*zoneid = psinfo->pr_zoneid;
2485 	if (proc->zspr_zoneid == -1)
2486 		*prev_zoneid = *zoneid;
2487 	else
2488 		*prev_zoneid = proc->zspr_zoneid;
2489 
2490 	TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2491 	*delta = d;
2492 
2493 	*sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2494 	    psinfo->pr_lwp.pr_pri);
2495 
2496 	/* Update cached data for proc */
2497 	proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2498 	proc->zspr_zoneid = psinfo->pr_zoneid;
2499 	proc->zspr_sched = *sched;
2500 	proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2501 	proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2502 	proc->zspr_ppid = psinfo->pr_ppid;
2503 }
2504 
2505 /*
2506  * Reset the known cpu usage of a process. This is done after a process
2507  * exits so that if the pid is recycled, data from its previous life is
2508  * not reused
2509  */
2510 static void
2511 zsd_flush_proc_info(zsd_proc_t *proc)
2512 {
2513 	proc->zspr_usage.tv_sec = 0;
2514 	proc->zspr_usage.tv_nsec = 0;
2515 }
2516 
2517 /*
2518  * Open the current extended accounting file.  On initialization, open the
2519  * file as the current file to be used.  Otherwise, open the file as the
2520  * next file to use of the current file reaches EOF.
2521  */
2522 static int
2523 zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2524 {
2525 	int ret, oret, state, trys = 0, flags;
2526 	int *fd, *open;
2527 	ea_file_t *eaf;
2528 	struct stat64 *stat;
2529 	char path[MAXPATHLEN];
2530 
2531 	/*
2532 	 * The accounting file is first opened at the tail.  Following
2533 	 * opens to new accounting files are opened at the head.
2534 	 */
2535 	if (init == B_TRUE) {
2536 		flags = EO_NO_VALID_HDR | EO_TAIL;
2537 		fd = &ctl->zsctl_proc_fd;
2538 		eaf = &ctl->zsctl_proc_eaf;
2539 		stat = &ctl->zsctl_proc_stat;
2540 		open = &ctl->zsctl_proc_open;
2541 	} else {
2542 		flags = EO_NO_VALID_HDR | EO_HEAD;
2543 		fd = &ctl->zsctl_proc_fd_next;
2544 		eaf = &ctl->zsctl_proc_eaf_next;
2545 		stat = &ctl->zsctl_proc_stat_next;
2546 		open = &ctl->zsctl_proc_open_next;
2547 	}
2548 
2549 	*fd = -1;
2550 	*open = 0;
2551 retry:
2552 	/* open accounting files for cpu consumption */
2553 	ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2554 	if (ret != 0) {
2555 		zsd_warn(gettext("Unable to get process accounting state"));
2556 		goto err;
2557 	}
2558 	if (state != AC_ON) {
2559 		if (trys > 0) {
2560 			zsd_warn(gettext(
2561 			    "Unable to enable process accounting"));
2562 			goto err;
2563 		}
2564 		(void) zsd_enable_cpu_stats();
2565 		trys++;
2566 		goto retry;
2567 	}
2568 
2569 	ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2570 	if (ret != 0) {
2571 		zsd_warn(gettext("Unable to get process accounting file"));
2572 		goto err;
2573 	}
2574 
2575 	if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
2576 	    (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2577 		ret = fstat64(*fd, stat);
2578 
2579 	if (*fd < 0 || oret < 0 || ret < 0) {
2580 		struct timespec ts;
2581 
2582 		/*
2583 		 * It is possible the accounting file is momentarily unavailable
2584 		 * because it is being rolled.  Try for up to half a second.
2585 		 *
2586 		 * If failure to open accounting file persists, give up.
2587 		 */
2588 		if (oret == 0)
2589 			(void) ea_close(eaf);
2590 		else if (*fd >= 0)
2591 			(void) close(*fd);
2592 		if (trys > 500) {
2593 			zsd_warn(gettext(
2594 			    "Unable to open process accounting file"));
2595 			goto err;
2596 		}
2597 		/* wait one millisecond */
2598 		ts.tv_sec = 0;
2599 		ts.tv_nsec = NANOSEC / 1000;
2600 		(void) nanosleep(&ts, NULL);
2601 		goto retry;
2602 	}
2603 	*open = 1;
2604 	return (0);
2605 err:
2606 	if (*fd >= 0)
2607 		(void) close(*fd);
2608 	*open = 0;
2609 	*fd = -1;
2610 	return (-1);
2611 }
2612 
2613 /*
2614  * Walk /proc and charge each process to its zone and processor set.
2615  * Then read exacct data for exited processes, and charge them as well.
2616  */
2617 static void
2618 zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2619 {
2620 	DIR *dir;
2621 	struct dirent *dent;
2622 	psinfo_t psinfo;
2623 	int fd, ret;
2624 	zsd_proc_t *proc, *pproc, *tmp, *next;
2625 	list_t pplist, plist;
2626 	zsd_zone_t *zone, *prev_zone;
2627 	zsd_pset_t *pset, *prev_pset;
2628 	psetid_t psetid, prev_psetid;
2629 	zoneid_t zoneid, prev_zoneid;
2630 	zsd_pset_usage_t *usage, *prev_usage;
2631 	char path[MAXPATHLEN];
2632 
2633 	ea_object_t object;
2634 	ea_object_t pobject;
2635 	boolean_t hrtime_expired = B_FALSE;
2636 	struct timeval interval_end;
2637 
2638 	timestruc_t delta, d1, d2;
2639 	uint_t sched = 0;
2640 
2641 	/*
2642 	 * Get the current accounting file.  The current accounting file
2643 	 * may be different than the file in use, as the accounting file
2644 	 * may have been rolled, or manually changed by an admin.
2645 	 */
2646 	ret = zsd_open_exacct(ctl, init);
2647 	if (ret != 0) {
2648 		zsd_warn(gettext("Unable to track process accounting"));
2649 		return;
2650 	}
2651 
2652 	/*
2653 	 * Mark the current time as the interval end time.  Don't track
2654 	 * processes that exit after this time.
2655 	 */
2656 	(void) gettimeofday(&interval_end, NULL);
2657 
2658 	dir = opendir("/proc");
2659 	if (dir == NULL) {
2660 		zsd_warn(gettext("Unable to open /proc"));
2661 		return;
2662 	}
2663 
2664 	dent = ctl->zsctl_procfs_dent;
2665 
2666 	(void) memset(dent, 0, ctl->zsctl_procfs_dent_size);
2667 
2668 	/* Walk all processes and compute each zone's usage on each pset. */
2669 	while (readdir_r(dir, dent) != 0) {
2670 
2671 		if (strcmp(dent->d_name, ".") == 0 ||
2672 		    strcmp(dent->d_name, "..") == 0)
2673 			continue;
2674 
2675 		(void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2676 		    dent->d_name);
2677 
2678 		fd = open(path, O_RDONLY);
2679 		if (fd < 0)
2680 			continue;
2681 
2682 		if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2683 			(void) close(fd);
2684 			continue;
2685 		}
2686 		(void) close(fd);
2687 
2688 		zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2689 		    &zoneid, &prev_zoneid, &delta, &sched);
2690 
2691 		d1.tv_sec = delta.tv_sec / 2;
2692 		d1.tv_nsec = delta.tv_nsec / 2;
2693 		d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2694 		d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2695 
2696 		/* Get the zone and pset this process is running in */
2697 		zone = zsd_lookup_zone_byid(ctl, zoneid);
2698 		if (zone == NULL)
2699 			continue;
2700 		pset = zsd_lookup_pset_byid(ctl, psetid);
2701 		if (pset == NULL)
2702 			continue;
2703 		usage = zsd_lookup_insert_usage(ctl, pset, zone);
2704 		if (usage == NULL)
2705 			continue;
2706 
2707 		/*
2708 		 * Get the usage of the previous zone and pset if they were
2709 		 * different.
2710 		 */
2711 		if (zoneid != prev_zoneid)
2712 			prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2713 		else
2714 			prev_zone = NULL;
2715 
2716 		if (psetid != prev_psetid)
2717 			prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2718 		else
2719 			prev_pset = NULL;
2720 
2721 		prev_usage = NULL;
2722 		if (prev_zone != NULL || prev_pset != NULL) {
2723 			if (prev_zone == NULL)
2724 				prev_zone = zone;
2725 			if (prev_pset == NULL)
2726 				prev_pset = pset;
2727 
2728 			prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2729 			    prev_zone);
2730 		}
2731 
2732 		/* Update the usage with the processes info */
2733 		if (prev_usage == NULL) {
2734 			zsd_mark_pset_usage_found(usage, sched);
2735 		} else {
2736 			zsd_mark_pset_usage_found(usage, sched);
2737 			zsd_mark_pset_usage_found(prev_usage, sched);
2738 		}
2739 
2740 		/*
2741 		 * First time around is just to get a starting point.  All
2742 		 * usages will be zero.
2743 		 */
2744 		if (init == B_TRUE)
2745 			continue;
2746 
2747 		if (prev_usage == NULL) {
2748 			zsd_add_usage(ctl, usage, &delta);
2749 		} else {
2750 			zsd_add_usage(ctl, usage, &d1);
2751 			zsd_add_usage(ctl, prev_usage, &d2);
2752 		}
2753 	}
2754 	(void) closedir(dir);
2755 
2756 	/*
2757 	 * No need to collect exited proc data on initialization.  Just
2758 	 * caching the usage of the known processes to get a zero starting
2759 	 * point.
2760 	 */
2761 	if (init == B_TRUE)
2762 		return;
2763 
2764 	/*
2765 	 * Add accounting records to account for processes which have
2766 	 * exited.
2767 	 */
2768 	list_create(&plist, sizeof (zsd_proc_t),
2769 	    offsetof(zsd_proc_t, zspr_next));
2770 	list_create(&pplist, sizeof (zsd_proc_t),
2771 	    offsetof(zsd_proc_t, zspr_next));
2772 
2773 	for (;;) {
2774 		pid_t pid;
2775 		pid_t ppid;
2776 		timestruc_t user, sys, proc_usage;
2777 		timestruc_t finish;
2778 		int numfound = 0;
2779 
2780 		bzero(&object, sizeof (object));
2781 		proc = NULL;
2782 		zone = NULL;
2783 		pset = NULL;
2784 		usage = NULL;
2785 		ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2786 		if (ret == EO_ERROR) {
2787 			if (ea_error() == EXR_EOF) {
2788 
2789 				struct stat64 *stat;
2790 				struct stat64 *stat_next;
2791 
2792 				/*
2793 				 * See if the next accounting file is the
2794 				 * same as the current accounting file.
2795 				 */
2796 				stat = &(ctl->zsctl_proc_stat);
2797 				stat_next = &(ctl->zsctl_proc_stat_next);
2798 				if (stat->st_ino == stat_next->st_ino &&
2799 				    stat->st_dev == stat_next->st_dev) {
2800 					/*
2801 					 * End of current accounting file is
2802 					 * reached, so finished.  Clear EOF
2803 					 * bit for next time around.
2804 					 */
2805 					ea_clear(&ctl->zsctl_proc_eaf);
2806 					break;
2807 				} else {
2808 					/*
2809 					 * Accounting file has changed.  Move
2810 					 * to current accounting file.
2811 					 */
2812 					(void) ea_close(&ctl->zsctl_proc_eaf);
2813 
2814 					ctl->zsctl_proc_fd =
2815 					    ctl->zsctl_proc_fd_next;
2816 					ctl->zsctl_proc_eaf =
2817 					    ctl->zsctl_proc_eaf_next;
2818 					ctl->zsctl_proc_stat =
2819 					    ctl->zsctl_proc_stat_next;
2820 
2821 					ctl->zsctl_proc_fd_next = -1;
2822 					ctl->zsctl_proc_open_next = 0;
2823 					continue;
2824 				}
2825 			} else {
2826 				/*
2827 				 * Other accounting error.  Give up on
2828 				 * accounting.
2829 				 */
2830 				goto ea_err;
2831 			}
2832 		}
2833 		/* Skip if not a process group */
2834 		if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2835 		    (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2836 			(void) ea_free_item(&object, EUP_ALLOC);
2837 			continue;
2838 		}
2839 
2840 		/* The process group entry should be complete */
2841 		while (numfound < 9) {
2842 			bzero(&pobject, sizeof (pobject));
2843 			ret = ea_get_object(&ctl->zsctl_proc_eaf,
2844 			    &pobject);
2845 			if (ret < 0) {
2846 				(void) ea_free_item(&object, EUP_ALLOC);
2847 				zsd_warn(
2848 				    "unable to get process accounting data");
2849 				goto ea_err;
2850 			}
2851 			/* Next entries should be process data */
2852 			if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2853 			    EXT_GROUP) {
2854 				(void) ea_free_item(&object, EUP_ALLOC);
2855 				(void) ea_free_item(&pobject, EUP_ALLOC);
2856 				zsd_warn(
2857 				    "process data of wrong type");
2858 				goto ea_err;
2859 			}
2860 			switch (pobject.eo_catalog & EXD_DATA_MASK) {
2861 			case EXD_PROC_PID:
2862 				pid = pobject.eo_item.ei_uint32;
2863 				proc = &(ctl->zsctl_proc_array[pid]);
2864 				/*
2865 				 * This process should not be currently in
2866 				 * the list of processes to process.
2867 				 */
2868 				assert(!list_link_active(&proc->zspr_next));
2869 				numfound++;
2870 				break;
2871 			case EXD_PROC_ANCPID:
2872 				ppid = pobject.eo_item.ei_uint32;
2873 				pproc = &(ctl->zsctl_proc_array[ppid]);
2874 				numfound++;
2875 				break;
2876 			case EXD_PROC_ZONENAME:
2877 				zone = zsd_lookup_zone(ctl,
2878 				    pobject.eo_item.ei_string, -1);
2879 				numfound++;
2880 				break;
2881 			case EXD_PROC_CPU_USER_SEC:
2882 				user.tv_sec =
2883 				    pobject.eo_item.ei_uint64;
2884 				numfound++;
2885 				break;
2886 			case EXD_PROC_CPU_USER_NSEC:
2887 				user.tv_nsec =
2888 				    pobject.eo_item.ei_uint64;
2889 				numfound++;
2890 				break;
2891 			case EXD_PROC_CPU_SYS_SEC:
2892 				sys.tv_sec =
2893 				    pobject.eo_item.ei_uint64;
2894 				numfound++;
2895 				break;
2896 			case EXD_PROC_CPU_SYS_NSEC:
2897 				sys.tv_nsec =
2898 				    pobject.eo_item.ei_uint64;
2899 				numfound++;
2900 				break;
2901 			case EXD_PROC_FINISH_SEC:
2902 				finish.tv_sec =
2903 				    pobject.eo_item.ei_uint64;
2904 				numfound++;
2905 				break;
2906 			case EXD_PROC_FINISH_NSEC:
2907 				finish.tv_nsec =
2908 				    pobject.eo_item.ei_uint64;
2909 				numfound++;
2910 				break;
2911 			}
2912 			(void) ea_free_item(&pobject, EUP_ALLOC);
2913 		}
2914 		(void) ea_free_item(&object, EUP_ALLOC);
2915 		if (numfound != 9) {
2916 			zsd_warn(gettext(
2917 			    "Malformed process accounting entry found"));
2918 			goto proc_done;
2919 		}
2920 
2921 		if (finish.tv_sec > interval_end.tv_sec ||
2922 		    (finish.tv_sec == interval_end.tv_sec &&
2923 		    finish.tv_nsec > (interval_end.tv_usec * 1000)))
2924 			hrtime_expired = B_TRUE;
2925 
2926 		/*
2927 		 * Try to identify the zone and pset to which this
2928 		 * exited process belongs.
2929 		 */
2930 		if (zone == NULL)
2931 			goto proc_done;
2932 
2933 		/* Save proc info */
2934 		proc->zspr_ppid = ppid;
2935 		proc->zspr_zoneid = zone->zsz_id;
2936 
2937 		prev_psetid = ZS_PSET_ERROR;
2938 		sched = 0;
2939 
2940 		/*
2941 		 * The following tries to deduce the processes pset.
2942 		 *
2943 		 * First choose pset and sched using cached value from the
2944 		 * most recent time the process has been seen.
2945 		 *
2946 		 * pset and sched can change across zone_enter, so make sure
2947 		 * most recent sighting of this process was in the same
2948 		 * zone before using most recent known value.
2949 		 *
2950 		 * If there is no known value, use value of processes
2951 		 * parent.  If parent is unknown, walk parents until a known
2952 		 * parent is found.
2953 		 *
2954 		 * If no parent in the zone is found, use the zone's default
2955 		 * pset and scheduling class.
2956 		 */
2957 		if (proc->zspr_psetid != ZS_PSET_ERROR) {
2958 			prev_psetid = proc->zspr_psetid;
2959 			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2960 			sched = proc->zspr_sched;
2961 		} else if (pproc->zspr_zoneid == zone->zsz_id &&
2962 		    pproc->zspr_psetid != ZS_PSET_ERROR) {
2963 			prev_psetid = pproc->zspr_psetid;
2964 			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2965 			sched = pproc->zspr_sched;
2966 		}
2967 
2968 		if (pset == NULL) {
2969 			/*
2970 			 * Process or processes parent has never been seen.
2971 			 * Save to deduce a known parent later.
2972 			 */
2973 			proc_usage = sys;
2974 			TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2975 			TIMESTRUC_DELTA(delta, proc_usage,
2976 			    proc->zspr_usage);
2977 			proc->zspr_usage = delta;
2978 			list_insert_tail(&plist, proc);
2979 			continue;
2980 		}
2981 
2982 		/* Add the zone's usage to the pset */
2983 		usage = zsd_lookup_insert_usage(ctl, pset, zone);
2984 		if (usage == NULL)
2985 			goto proc_done;
2986 
2987 		zsd_mark_pset_usage_found(usage, sched);
2988 
2989 		/* compute the usage to add for the exited proc */
2990 		proc_usage = sys;
2991 		TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2992 		TIMESTRUC_DELTA(delta, proc_usage,
2993 		    proc->zspr_usage);
2994 
2995 		zsd_add_usage(ctl, usage, &delta);
2996 proc_done:
2997 		zsd_flush_proc_info(proc);
2998 
2999 		if (hrtime_expired == B_TRUE)
3000 			break;
3001 	}
3002 	/*
3003 	 * close next accounting file.
3004 	 */
3005 	if (ctl->zsctl_proc_open_next) {
3006 		(void) ea_close(
3007 		    &ctl->zsctl_proc_eaf_next);
3008 		ctl->zsctl_proc_open_next = 0;
3009 		ctl->zsctl_proc_fd_next = -1;
3010 	}
3011 
3012 	/* For the remaining processes, use pset and sched of a known parent */
3013 	proc = list_head(&plist);
3014 	while (proc != NULL) {
3015 		next = proc;
3016 		for (;;) {
3017 			if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3018 				/*
3019 				 * Kernel process, or parent is unknown, skip
3020 				 * process, remove from process list.
3021 				 */
3022 				tmp = proc;
3023 				proc = list_next(&plist, proc);
3024 				list_link_init(&tmp->zspr_next);
3025 				break;
3026 			}
3027 			pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3028 			if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3029 				/*
3030 				 * Parent in different zone.  Save process and
3031 				 * use zone's default pset and sched below
3032 				 */
3033 				tmp = proc;
3034 				proc = list_next(&plist, proc);
3035 				list_remove(&plist, tmp);
3036 				list_insert_tail(&pplist, tmp);
3037 				break;
3038 			}
3039 			/* Parent has unknown pset, Search parent's parent  */
3040 			if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3041 				next = pproc;
3042 				continue;
3043 			}
3044 			/* Found parent with known pset.  Use its info */
3045 			proc->zspr_psetid = pproc->zspr_psetid;
3046 			proc->zspr_sched = pproc->zspr_sched;
3047 			next->zspr_psetid = pproc->zspr_psetid;
3048 			next->zspr_sched = pproc->zspr_sched;
3049 			zone = zsd_lookup_zone_byid(ctl,
3050 			    proc->zspr_zoneid);
3051 			if (zone == NULL) {
3052 				tmp = proc;
3053 				proc = list_next(&plist, proc);
3054 				list_remove(&plist, tmp);
3055 				list_link_init(&tmp->zspr_next);
3056 				break;
3057 			}
3058 			pset = zsd_lookup_pset_byid(ctl,
3059 			    proc->zspr_psetid);
3060 			if (pset == NULL) {
3061 				tmp = proc;
3062 				proc = list_next(&plist, proc);
3063 				list_remove(&plist, tmp);
3064 				list_link_init(&tmp->zspr_next);
3065 				break;
3066 			}
3067 			/* Add the zone's usage to the pset */
3068 			usage = zsd_lookup_insert_usage(ctl, pset, zone);
3069 			if (usage == NULL) {
3070 				tmp = proc;
3071 				proc = list_next(&plist, proc);
3072 				list_remove(&plist, tmp);
3073 				list_link_init(&tmp->zspr_next);
3074 				break;
3075 			}
3076 			zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3077 			zsd_add_usage(ctl, usage, &proc->zspr_usage);
3078 			zsd_flush_proc_info(proc);
3079 			tmp = proc;
3080 			proc = list_next(&plist, proc);
3081 			list_remove(&plist, tmp);
3082 			list_link_init(&tmp->zspr_next);
3083 			break;
3084 		}
3085 	}
3086 	/*
3087 	 * Process has never been seen.  Using zone info to
3088 	 * determine pset and scheduling class.
3089 	 */
3090 	proc = list_head(&pplist);
3091 	while (proc != NULL) {
3092 
3093 		zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3094 		if (zone == NULL)
3095 			goto next;
3096 		if (zone->zsz_psetid != ZS_PSET_ERROR &&
3097 		    zone->zsz_psetid != ZS_PSET_MULTI) {
3098 			prev_psetid = zone->zsz_psetid;
3099 			pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3100 		} else {
3101 			pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3102 			if (pset != NULL)
3103 				prev_psetid = pset->zsp_id;
3104 		}
3105 		if (pset == NULL)
3106 			goto next;
3107 
3108 		sched = zone->zsz_scheds;
3109 		/*
3110 		 * Ignore FX high scheduling class if it is not the
3111 		 * only scheduling class in the zone.
3112 		 */
3113 		if (sched != ZS_SCHED_FX_60)
3114 			sched &= (~ZS_SCHED_FX_60);
3115 		/*
3116 		 * If more than one scheduling class has been found
3117 		 * in the zone, use zone's default scheduling class for
3118 		 * this process.
3119 		 */
3120 		if ((sched & (sched - 1)) != 0)
3121 			sched = zone->zsz_default_sched;
3122 
3123 		/* Add the zone's usage to the pset */
3124 		usage = zsd_lookup_insert_usage(ctl, pset, zone);
3125 		if (usage == NULL)
3126 			goto next;
3127 
3128 		zsd_mark_pset_usage_found(usage, sched);
3129 		zsd_add_usage(ctl, usage, &proc->zspr_usage);
3130 next:
3131 		tmp = proc;
3132 		proc = list_next(&pplist, proc);
3133 		zsd_flush_proc_info(tmp);
3134 		list_link_init(&tmp->zspr_next);
3135 	}
3136 	return;
3137 ea_err:
3138 	/*
3139 	 * Close the next accounting file if we have not transitioned to it
3140 	 * yet.
3141 	 */
3142 	if (ctl->zsctl_proc_open_next) {
3143 		(void) ea_close(&ctl->zsctl_proc_eaf_next);
3144 		ctl->zsctl_proc_open_next = 0;
3145 		ctl->zsctl_proc_fd_next = -1;
3146 	}
3147 }
3148 
3149 /*
3150  * getvmusage(2) uses size_t's in the passwd data structure, which differ
3151  * in size for 32bit and 64 bit kernels.  Since this is a contracted interface,
3152  * and zonestatd does not necessarily match the kernel's bitness, marshal
3153  * results appropriately.
3154  */
3155 static int
3156 zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3157     uint64_t *nres)
3158 {
3159 	zsd_vmusage32_t *vmu32;
3160 	zsd_vmusage64_t *vmu64;
3161 	uint32_t nres32;
3162 	int i;
3163 	int ret;
3164 
3165 	if (ctl->zsctl_kern_bits == 32)  {
3166 		nres32 = *nres;
3167 		ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3168 		    flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3169 		*nres = nres32;
3170 		if (ret == 0 && buf != NULL) {
3171 			/*
3172 			 * An array of vmusage32_t's has been returned.
3173 			 * Convert it to an array of vmusage64_t's.
3174 			 */
3175 			vmu32 = (zsd_vmusage32_t *)buf;
3176 			vmu64 = (zsd_vmusage64_t *)buf;
3177 			for (i = nres32 - 1; i >= 0; i--) {
3178 
3179 				vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3180 				vmu64[i].vmu_type = vmu32[i].vmu_type;
3181 				vmu64[i].vmu_type = vmu32[i].vmu_type;
3182 				vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3183 				vmu64[i].vmu_rss_private =
3184 				    vmu32[i].vmu_rss_private;
3185 				vmu64[i].vmu_rss_shared =
3186 				    vmu32[i].vmu_rss_shared;
3187 				vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3188 				vmu64[i].vmu_swap_private =
3189 				    vmu32[i].vmu_swap_private;
3190 				vmu64[i].vmu_swap_shared =
3191 				    vmu32[i].vmu_swap_shared;
3192 			}
3193 		}
3194 		return (ret);
3195 	} else {
3196 		/*
3197 		 * kernel is 64 bit, so use 64 bit structures as zonestat
3198 		 * expects.
3199 		 */
3200 		return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3201 		    flags, age, (uintptr_t)buf, (