xref: /illumos-gate/usr/src/uts/common/vm/vm_usage.c (revision c6f039c7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * vm_usage
29  *
30  * This file implements the getvmusage() private system call.
31  * getvmusage() counts the amount of resident memory pages and swap
32  * reserved by the specified process collective. A "process collective" is
33  * the set of processes owned by a particular zone, project, task, or user.
34  *
35  * rss and swap are counted so that for a given process collective, a page is
36  * only counted once.  For example, this means that if multiple processes in
37  * the same project map the same page, then the project will only be charged
38  * once for that page.  On the other hand, if two processes in different
39  * projects map the same page, then both projects will be charged
40  * for the page.
41  *
42  * The vm_getusage() calculation is implemented so that the first thread
43  * performs the rss/swap counting. Other callers will wait for that thread to
44  * finish, copying the results.  This enables multiple rcapds and prstats to
45  * consume data from the same calculation.  The results are also cached so that
46  * a caller interested in recent results can just copy them instead of starting
47  * a new calculation. The caller passes the maximum age (in seconds) of the
48  * data.  If the cached data is young enough, the cache is copied, otherwise,
49  * a new calculation is executed and the cache is replaced with the new
50  * data.
51  *
52  * The rss calculation for each process collective is as follows:
53  *
54  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
55  *     and/or users.
56  *   - For each proc:
57  *	- Figure out proc's collectives (zone, project, task, and/or user).
58  *	- For each seg in proc's address space:
59  *		- If seg is private:
60  *			- Lookup anons in the amp.
61  *			- For incore pages not previously visited for each of
62  *			  the proc's collectives, add incore pagesize to each
63  *			  collective.
64  *			  Anons with a refcnt of 1 can be assumed to be not
65  *			  previously visited.
66  *			- For address ranges without anons in the amp:
67  *				- Lookup pages in underlying vnode.
68  *				- For incore pages not previously visited for
69  *				  each of the proc's collectives, add incore
70  *				  pagesize to each collective.
71  *		- If seg is shared:
72  *			- Lookup pages in the shared amp or vnode.
73  *			- For incore pages not previously visited for each of
74  *			  the proc's collectives, add incore pagesize to each
75  *			  collective.
76  *
77  * Swap is reserved by private segments, and shared anonymous segments.
78  * The only shared anon segments which do not reserve swap are ISM segments
79  * and schedctl segments, both of which can be identified by having
80  * amp->swresv == 0.
81  *
82  * The swap calculation for each collective is as follows:
83  *
84  *   - Inspect flags, determine if counting swap for zones, projects, tasks,
85  *     and/or users.
86  *   - For each proc:
87  *	- Figure out proc's collectives (zone, project, task, and/or user).
88  *	- For each seg in proc's address space:
89  *		- If seg is private:
90  *			- Add svd->swresv pages to swap count for each of the
91  *			  proc's collectives.
92  *		- If seg is anon, shared, and amp->swresv != 0
93  *			- For address ranges in amp not previously visited for
94  *			  each of the proc's collectives, add size of address
95  *			  range to the swap count for each collective.
96  *
97  * These two calculations are done simultaneously, with most of the work
98  * being done in vmu_calculate_seg().  The results of the calculation are
99  * copied into "vmu_data.vmu_cache_results".
100  *
101  * To perform the calculation, various things are tracked and cached:
102  *
103  *    - incore/not-incore page ranges for all vnodes.
104  *	(vmu_data.vmu_all_vnodes_hash)
105  *	This eliminates looking up the same page more than once.
106  *
107  *    - incore/not-incore page ranges for all shared amps.
108  *	(vmu_data.vmu_all_amps_hash)
109  *	This eliminates looking up the same page more than once.
110  *
111  *    - visited page ranges for each collective.
112  *	   - per vnode (entity->vme_vnode_hash)
113  *	   - per shared amp (entity->vme_amp_hash)
114  *	For accurate counting of map-shared and COW-shared pages.
115  *
116  *    - visited private anons (refcnt > 1) for each collective.
117  *	(entity->vme_anon_hash)
118  *	For accurate counting of COW-shared pages.
119  *
120  * The common accounting structure is the vmu_entity_t, which represents
121  * collectives:
122  *
123  *    - A zone.
124  *    - A project, task, or user within a zone.
125  *    - The entire system (vmu_data.vmu_system).
126  *    - Each collapsed (col) project and user.  This means a given projid or
127  *	uid, regardless of which zone the process is in.  For instance,
128  *	project 0 in the global zone and project 0 in a non-global zone are
129  *	the same collapsed project.
130  *
131  *  Each entity structure tracks which pages have already been visited for
132  *  that entity (via previously inspected processes) so that these pages are
133  *  not double counted.
134  */
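
/*
 * Illustrative userland sketch (not part of this file): consumers such as
 * rcapd and prstat reach this code through the getvmusage(2) wrapper.  The
 * fragment below is a hedged example assuming the documented signature
 *	int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres);
 * the buffer size and the 5 second maximum age are arbitrary illustration
 * values.  On input *nres is the capacity of buf in vmusage_t entries; on
 * return it holds the number of results.
 *
 *	#include <sys/vm_usage.h>
 *	#include <stdio.h>
 *
 *	size_t nres = 512;
 *	vmusage_t buf[512];
 *
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) == 0) {
 *		for (size_t i = 0; i < nres; i++) {
 *			(void) printf("zone %d: rss %llu swap %llu\n",
 *			    (int)buf[i].vmu_id,
 *			    (u_longlong_t)buf[i].vmu_rss_all,
 *			    (u_longlong_t)buf[i].vmu_swap_all);
 *		}
 *	}
 */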
135 
136 #include <sys/errno.h>
137 #include <sys/types.h>
138 #include <sys/zone.h>
139 #include <sys/proc.h>
140 #include <sys/project.h>
141 #include <sys/task.h>
142 #include <sys/thread.h>
143 #include <sys/time.h>
144 #include <sys/mman.h>
145 #include <sys/modhash.h>
146 #include <sys/modhash_impl.h>
147 #include <sys/shm.h>
148 #include <sys/swap.h>
149 #include <sys/synch.h>
150 #include <sys/systm.h>
151 #include <sys/var.h>
152 #include <sys/vm_usage.h>
153 #include <sys/zone.h>
154 #include <sys/sunddi.h>
155 #include <sys/avl.h>
156 #include <vm/anon.h>
157 #include <vm/as.h>
158 #include <vm/seg_vn.h>
159 #include <vm/seg_spt.h>
160 
161 #define	VMUSAGE_HASH_SIZE		512
162 
163 #define	VMUSAGE_TYPE_VNODE		1
164 #define	VMUSAGE_TYPE_AMP		2
165 #define	VMUSAGE_TYPE_ANON		3
166 
167 #define	VMUSAGE_BOUND_UNKNOWN		0
168 #define	VMUSAGE_BOUND_INCORE		1
169 #define	VMUSAGE_BOUND_NOT_INCORE	2
170 
171 #define	ISWITHIN(node, addr)	((node)->vmb_start <= addr && \
172 				    (node)->vmb_end >= addr ? 1 : 0)
173 
174 /*
175  * bounds for vnodes and shared amps
176  * Each bound is either entirely incore, entirely not in core, or
177  * entirely unknown.  Bounds are stored in an AVL tree sorted by start member
178  * when in use, otherwise (free or temporary lists) they're strung
179  * together off of vmb_next.
180  */
181 typedef struct vmu_bound {
182 	avl_node_t vmb_node;
183 	struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
184 	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
185 	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
186 	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
187 } vmu_bound_t;
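
/*
 * Illustrative example (page numbers are made up): if pages 0-3 of a vnode
 * are found resident and pages 4-9 are not, the object's bounds tree holds
 *
 *	{ vmb_start = 0, vmb_end = 3, vmb_type = VMUSAGE_BOUND_INCORE }
 *	{ vmb_start = 4, vmb_end = 9, vmb_type = VMUSAGE_BOUND_NOT_INCORE }
 *
 * A later lookup covering pages 8-12 reuses the second bound for pages 8-9
 * and inserts a new VMUSAGE_BOUND_UNKNOWN bound for pages 10-12, which the
 * incore-update routines below later resolve.
 */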
188 
189 /*
190  * hash of visited objects (vnodes or shared amps)
191  * key is the address of the vnode or amp.  The bounds tree holds the known
192  * incore/not-incore bounds for the vnode/amp.
193  */
194 typedef struct vmu_object {
195 	struct vmu_object	*vmo_next;	/* free list */
196 	caddr_t		vmo_key;
197 	short		vmo_type;
198 	avl_tree_t	vmo_bounds;
199 } vmu_object_t;
200 
201 /*
202  * Entity by which to count results.
203  *
204  * The entity structure keeps the current rss/swap counts for each entity
205  * (zone, project, etc), and hashes of vm structures that have already
206  * been visited for the entity.
207  *
208  * vme_next:	links the list of all entities currently being counted by
209  *		vmu_calculate().
210  *
211  * vme_next_calc: links the list of entities related to the current process
212  *		 being counted by vmu_calculate_proc().
213  *
214  * vmu_calculate_proc() walks all processes.  For each process, it makes a
215  * list of the entities related to that process using vme_next_calc.  This
216  * list changes each time vmu_calculate_proc() is called.
217  *
218  */
219 typedef struct vmu_entity {
220 	struct vmu_entity *vme_next;
221 	struct vmu_entity *vme_next_calc;
222 	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
223 	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
224 	mod_hash_t	*vme_anon_hash;	 /* COW anons visited for entity */
225 	vmusage_t	vme_result;	 /* identifies entity and results */
226 } vmu_entity_t;
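
/*
 * Illustrative example (ids are made up): for a process in zone 2,
 * project 10, with vmu_calc_flags containing VMUSAGE_ALL_ZONES,
 * VMUSAGE_ALL_PROJECTS and VMUSAGE_COL_PROJECTS, vmu_calculate_proc()
 * strings three entities onto its vme_next_calc list: the entity for
 * zone 2, the entity for project 10 within zone 2, and the collapsed
 * entity for project 10 across all zones.  Each segment of the process is
 * then counted against all three, subject to the per-entity visited-page
 * tracking described above.
 */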
227 
228 /*
229  * Hash of entities visited within a zone, and an entity for the zone
230  * itself.
231  */
232 typedef struct vmu_zone {
233 	struct vmu_zone	*vmz_next;	/* free list */
234 	id_t		vmz_id;
235 	vmu_entity_t	*vmz_zone;
236 	mod_hash_t	*vmz_projects_hash;
237 	mod_hash_t	*vmz_tasks_hash;
238 	mod_hash_t	*vmz_rusers_hash;
239 	mod_hash_t	*vmz_eusers_hash;
240 } vmu_zone_t;
241 
242 /*
243  * Cache of results from last calculation
244  */
245 typedef struct vmu_cache {
246 	vmusage_t	*vmc_results;	/* Results from last call to */
247 					/* vm_getusage(). */
248 	uint64_t	vmc_nresults;	/* Count of cached results */
249 	uint64_t	vmc_refcnt;	/* refcnt for free */
250 	uint_t		vmc_flags;	/* Flags for vm_getusage() */
251 	hrtime_t	vmc_timestamp;	/* when cache was created */
252 } vmu_cache_t;
253 
254 /*
255  * top level rss info for the system
256  */
257 typedef struct vmu_data {
258 	kmutex_t	vmu_lock;		/* Protects vmu_data */
259 	kcondvar_t	vmu_cv;			/* Used to signal threads */
260 						/* Waiting for */
261 						/* Rss_calc_thread to finish */
262 	vmu_entity_t	*vmu_system;		/* Entity for tracking */
263 						/* rss/swap for all processes */
264 						/* in all zones */
265 	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
266 	mod_hash_t	*vmu_projects_col_hash; /* These *_col_hash hashes */
267 	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
268 	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
269 						/* to implement VMUSAGE_COL_* */
270 						/* flags, which aggregate by */
271 						/* project or user regardless */
272 						/* of zoneid. */
273 	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
274 						/* to track incore/not-incore */
275 	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
276 						/* amps to track incore/not- */
277 						/* incore */
278 	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
279 	size_t		vmu_nentities;		/* Count of entities in list */
280 	vmu_cache_t	*vmu_cache;		/* Cached results */
281 	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
282 						/* vmu_calculate() */
283 	uint_t		vmu_calc_flags;		/* Flags being used by */
284 						/* currently running calc */
285 						/* thread */
286 	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
287 						/* threads waiting for */
288 						/* calc thread to finish */
289 	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
290 						/* for calc thread */
291 	vmu_bound_t	*vmu_free_bounds;
292 	vmu_object_t	*vmu_free_objects;
293 	vmu_entity_t	*vmu_free_entities;
294 	vmu_zone_t	*vmu_free_zones;
295 } vmu_data_t;
296 
297 extern struct as kas;
298 extern proc_t *practive;
299 extern zone_t *global_zone;
300 extern struct seg_ops segvn_ops;
301 extern struct seg_ops segspt_shmops;
302 
303 static vmu_data_t vmu_data;
304 static kmem_cache_t *vmu_bound_cache;
305 static kmem_cache_t *vmu_object_cache;
306 
307 /*
308  * Comparison routine for AVL tree. We base our comparison on vmb_start.
309  */
310 static int
311 bounds_cmp(const void *bnd1, const void *bnd2)
312 {
313 	const vmu_bound_t *bound1 = bnd1;
314 	const vmu_bound_t *bound2 = bnd2;
315 
316 	if (bound1->vmb_start == bound2->vmb_start) {
317 		return (0);
318 	}
319 	if (bound1->vmb_start < bound2->vmb_start) {
320 		return (-1);
321 	}
322 
323 	return (1);
324 }
325 
326 /*
327  * Save a bound on the free list.
328  */
329 static void
330 vmu_free_bound(vmu_bound_t *bound)
331 {
332 	bound->vmb_next = vmu_data.vmu_free_bounds;
333 	bound->vmb_start = 0;
334 	bound->vmb_end = 0;
335 	bound->vmb_type = 0;
336 	vmu_data.vmu_free_bounds = bound;
337 }
338 
339 /*
340  * Free an object, and all visited bound info.
341  */
342 static void
343 vmu_free_object(mod_hash_val_t val)
344 {
345 	vmu_object_t *obj = (vmu_object_t *)val;
346 	avl_tree_t *tree = &(obj->vmo_bounds);
347 	vmu_bound_t *bound;
348 	void *cookie = NULL;
349 
350 	while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
351 		vmu_free_bound(bound);
352 	avl_destroy(tree);
353 
354 	obj->vmo_type = 0;
355 	obj->vmo_next = vmu_data.vmu_free_objects;
356 	vmu_data.vmu_free_objects = obj;
357 }
358 
359 /*
360  * Free an entity, and hashes of visited objects for that entity.
361  */
362 static void
363 vmu_free_entity(mod_hash_val_t val)
364 {
365 	vmu_entity_t *entity = (vmu_entity_t *)val;
366 
367 	if (entity->vme_vnode_hash != NULL)
368 		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
369 	if (entity->vme_amp_hash != NULL)
370 		i_mod_hash_clear_nosync(entity->vme_amp_hash);
371 	if (entity->vme_anon_hash != NULL)
372 		i_mod_hash_clear_nosync(entity->vme_anon_hash);
373 
374 	entity->vme_next = vmu_data.vmu_free_entities;
375 	vmu_data.vmu_free_entities = entity;
376 }
377 
378 /*
379  * Free zone entity, and all hashes of entities inside that zone,
380  * which are projects, tasks, and users.
381  */
382 static void
383 vmu_free_zone(mod_hash_val_t val)
384 {
385 	vmu_zone_t *zone = (vmu_zone_t *)val;
386 
387 	if (zone->vmz_zone != NULL) {
388 		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
389 		zone->vmz_zone = NULL;
390 	}
391 	if (zone->vmz_projects_hash != NULL)
392 		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
393 	if (zone->vmz_tasks_hash != NULL)
394 		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
395 	if (zone->vmz_rusers_hash != NULL)
396 		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
397 	if (zone->vmz_eusers_hash != NULL)
398 		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
399 	zone->vmz_next = vmu_data.vmu_free_zones;
400 	vmu_data.vmu_free_zones = zone;
401 }
402 
403 /*
404  * Initialize synchronization primitives and hashes for system-wide tracking
405  * of visited vnodes and shared amps.  Initialize results cache.
406  */
407 void
408 vm_usage_init()
409 {
410 	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
411 	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
412 
413 	vmu_data.vmu_system = NULL;
414 	vmu_data.vmu_zones_hash = NULL;
415 	vmu_data.vmu_projects_col_hash = NULL;
416 	vmu_data.vmu_rusers_col_hash = NULL;
417 	vmu_data.vmu_eusers_col_hash = NULL;
418 
419 	vmu_data.vmu_free_bounds = NULL;
420 	vmu_data.vmu_free_objects = NULL;
421 	vmu_data.vmu_free_entities = NULL;
422 	vmu_data.vmu_free_zones = NULL;
423 
424 	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
425 	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
426 	    sizeof (vnode_t));
427 	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
428 	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
429 	    sizeof (struct anon_map));
430 	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
431 	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
432 	    vmu_free_entity);
433 	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
434 	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
435 	    vmu_free_entity);
436 	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
437 	    "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
438 	    vmu_free_entity);
439 	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
440 	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
441 
442 	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
443 	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
444 	vmu_object_cache = kmem_cache_create("vmu_object_cache",
445 	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
446 
447 	vmu_data.vmu_entities = NULL;
448 	vmu_data.vmu_nentities = 0;
449 
450 	vmu_data.vmu_cache = NULL;
451 	vmu_data.vmu_calc_thread = NULL;
452 	vmu_data.vmu_calc_flags = 0;
453 	vmu_data.vmu_pending_flags = 0;
454 	vmu_data.vmu_pending_waiters = 0;
455 }
456 
457 /*
458  * Allocate hashes for tracking vm objects visited for an entity.
459  * Update list of entities.
460  */
461 static vmu_entity_t *
462 vmu_alloc_entity(id_t id, int type, id_t zoneid)
463 {
464 	vmu_entity_t *entity;
465 
466 	if (vmu_data.vmu_free_entities != NULL) {
467 		entity = vmu_data.vmu_free_entities;
468 		vmu_data.vmu_free_entities =
469 		    vmu_data.vmu_free_entities->vme_next;
470 		bzero(&entity->vme_result, sizeof (vmusage_t));
471 	} else {
472 		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
473 	}
474 	entity->vme_result.vmu_id = id;
475 	entity->vme_result.vmu_zoneid = zoneid;
476 	entity->vme_result.vmu_type = type;
477 
478 	if (entity->vme_vnode_hash == NULL)
479 		entity->vme_vnode_hash = mod_hash_create_ptrhash(
480 		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
481 		    sizeof (vnode_t));
482 
483 	if (entity->vme_amp_hash == NULL)
484 		entity->vme_amp_hash = mod_hash_create_ptrhash(
485 		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
486 		    sizeof (struct anon_map));
487 
488 	if (entity->vme_anon_hash == NULL)
489 		entity->vme_anon_hash = mod_hash_create_ptrhash(
490 		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
491 		    mod_hash_null_valdtor, sizeof (struct anon));
492 
493 	entity->vme_next = vmu_data.vmu_entities;
494 	vmu_data.vmu_entities = entity;
495 	vmu_data.vmu_nentities++;
496 
497 	return (entity);
498 }
499 
500 /*
501  * Allocate a zone entity, and hashes for tracking visited vm objects
502  * for projects, tasks, and users within that zone.
503  */
504 static vmu_zone_t *
505 vmu_alloc_zone(id_t id)
506 {
507 	vmu_zone_t *zone;
508 
509 	if (vmu_data.vmu_free_zones != NULL) {
510 		zone = vmu_data.vmu_free_zones;
511 		vmu_data.vmu_free_zones =
512 		    vmu_data.vmu_free_zones->vmz_next;
513 		zone->vmz_next = NULL;
514 		zone->vmz_zone = NULL;
515 	} else {
516 		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
517 	}
518 
519 	zone->vmz_id = id;
520 
521 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
522 		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
523 
524 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
525 	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
526 		zone->vmz_projects_hash = mod_hash_create_idhash(
527 		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
528 
529 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
530 	    != 0 && zone->vmz_tasks_hash == NULL)
531 		zone->vmz_tasks_hash = mod_hash_create_idhash(
532 		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
533 
534 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
535 	    != 0 && zone->vmz_rusers_hash == NULL)
536 		zone->vmz_rusers_hash = mod_hash_create_idhash(
537 		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
538 
539 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
540 	    != 0 && zone->vmz_eusers_hash == NULL)
541 		zone->vmz_eusers_hash = mod_hash_create_idhash(
542 		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
543 
544 	return (zone);
545 }
546 
547 /*
548  * Allocate a structure for tracking visited bounds for a vm object.
549  */
550 static vmu_object_t *
551 vmu_alloc_object(caddr_t key, int type)
552 {
553 	vmu_object_t *object;
554 
555 	if (vmu_data.vmu_free_objects != NULL) {
556 		object = vmu_data.vmu_free_objects;
557 		vmu_data.vmu_free_objects =
558 		    vmu_data.vmu_free_objects->vmo_next;
559 	} else {
560 		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
561 	}
562 
563 	object->vmo_next = NULL;
564 	object->vmo_key = key;
565 	object->vmo_type = type;
566 	avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
567 
568 	return (object);
569 }
570 
571 /*
572  * Allocate and return a bound structure.
573  */
574 static vmu_bound_t *
575 vmu_alloc_bound()
576 {
577 	vmu_bound_t *bound;
578 
579 	if (vmu_data.vmu_free_bounds != NULL) {
580 		bound = vmu_data.vmu_free_bounds;
581 		vmu_data.vmu_free_bounds =
582 		    vmu_data.vmu_free_bounds->vmb_next;
583 	} else {
584 		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
585 	}
586 
587 	bound->vmb_next = NULL;
588 	bound->vmb_start = 0;
589 	bound->vmb_end = 0;
590 	bound->vmb_type = 0;
591 	return (bound);
592 }
593 
594 /*
595  * vmu_find_insert_* functions implement hash lookup or allocate and
596  * insert operations.
597  */
598 static vmu_object_t *
599 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
600 {
601 	int ret;
602 	vmu_object_t *object;
603 
604 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
605 	    (mod_hash_val_t *)&object);
606 	if (ret != 0) {
607 		object = vmu_alloc_object(key, type);
608 		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
609 		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
610 		ASSERT(ret == 0);
611 	}
612 	return (object);
613 }
614 
615 static int
616 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
617 {
618 	int ret;
619 	caddr_t val;
620 
621 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
622 	    (mod_hash_val_t *)&val);
623 
624 	if (ret == 0)
625 		return (0);
626 
627 	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
628 	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);
629 
630 	ASSERT(ret == 0);
631 
632 	return (1);
633 }
634 
635 static vmu_entity_t *
636 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
637 {
638 	int ret;
639 	vmu_entity_t *entity;
640 
641 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
642 	    (mod_hash_val_t *)&entity);
643 	if (ret != 0) {
644 		entity = vmu_alloc_entity(id, type, zoneid);
645 		ret = i_mod_hash_insert_nosync(hash,
646 		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
647 		    (mod_hash_hndl_t)0);
648 		ASSERT(ret == 0);
649 	}
650 	return (entity);
651 }
652 
653 
654 
655 
656 /*
657  * Returns list of object bounds between start and end.  New bounds inserted
658  * by this call are given type.
659  *
660  * Returns the number of pages covered if new bounds are created.  Returns 0
661  * if region between start/end consists of all existing bounds.
662  */
663 static pgcnt_t
664 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
665     end, char type, vmu_bound_t **first, vmu_bound_t **last)
666 {
667 	avl_tree_t	*tree = &(ro->vmo_bounds);
668 	avl_index_t	where;
669 	vmu_bound_t	*walker, *tmp;
670 	pgcnt_t		ret = 0;
671 
672 	ASSERT(start <= end);
673 
674 	*first = *last = NULL;
675 
676 	tmp = vmu_alloc_bound();
677 	tmp->vmb_start = start;
678 	tmp->vmb_type = type;
679 
680 	/* Hopelessly optimistic case. */
681 	if (walker = avl_find(tree, tmp, &where)) {
682 		/* We got lucky. */
683 		vmu_free_bound(tmp);
684 		*first = walker;
685 	}
686 
687 	if (walker == NULL) {
688 		/* Is start in the previous node? */
689 		walker = avl_nearest(tree, where, AVL_BEFORE);
690 		if (walker != NULL) {
691 			if (ISWITHIN(walker, start)) {
692 				/* We found start. */
693 				vmu_free_bound(tmp);
694 				*first = walker;
695 			}
696 		}
697 	}
698 
699 	/*
700 	 * At this point, if *first is still NULL, then we
701 	 * didn't get a direct hit and start isn't covered
702 	 * by the previous node. We know that the next node
703 	 * must have a greater start value than we require
704 	 * because avl_find tells us where the AVL routines would
705 	 * insert our new node. We have some gap between the
706 	 * start we want and the next node.
707 	 */
708 	if (*first == NULL) {
709 		walker = avl_nearest(tree, where, AVL_AFTER);
710 		if (walker != NULL && walker->vmb_start <= end) {
711 			/* Fill the gap. */
712 			tmp->vmb_end = walker->vmb_start - 1;
713 			*first = tmp;
714 		} else {
715 			/* We have a gap over [start, end]. */
716 			tmp->vmb_end = end;
717 			*first = *last = tmp;
718 		}
719 		ret += tmp->vmb_end - tmp->vmb_start + 1;
720 		avl_insert(tree, tmp, where);
721 	}
722 
723 	ASSERT(*first != NULL);
724 
725 	if (*last != NULL) {
726 		/* We're done. */
727 		return (ret);
728 	}
729 
730 	/*
731 	 * If we are here we still need to set *last and
732 	 * that may involve filling in some gaps.
733 	 */
734 	*last = *first;
735 	for (;;) {
736 		if (ISWITHIN(*last, end)) {
737 			/* We're done. */
738 			break;
739 		}
740 		walker = AVL_NEXT(tree, *last);
741 		if (walker == NULL || walker->vmb_start > end) {
742 			/* Bottom or mid tree with gap. */
743 			tmp = vmu_alloc_bound();
744 			tmp->vmb_start = (*last)->vmb_end + 1;
745 			tmp->vmb_end = end;
746 			tmp->vmb_type = type;
747 			ret += tmp->vmb_end - tmp->vmb_start + 1;
748 			avl_insert_here(tree, tmp, *last, AVL_AFTER);
749 			*last = tmp;
750 			break;
751 		} else {
752 			if ((*last)->vmb_end + 1 != walker->vmb_start) {
753 				/* Non-contiguous. */
754 				tmp = vmu_alloc_bound();
755 				tmp->vmb_start = (*last)->vmb_end + 1;
756 				tmp->vmb_end = walker->vmb_start - 1;
757 				tmp->vmb_type = type;
758 				ret += tmp->vmb_end - tmp->vmb_start + 1;
759 				avl_insert_here(tree, tmp, *last, AVL_AFTER);
760 				*last = tmp;
761 			} else {
762 				*last = walker;
763 			}
764 		}
765 	}
766 
767 	return (ret);
768 }
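
/*
 * Worked example for vmu_insert_lookup_object_bounds() above (page numbers
 * are made up): if the tree already holds a bound covering pages 0-3 and
 * the caller asks for pages 0-9 with type VMUSAGE_BOUND_UNKNOWN, the
 * existing bound becomes *first, a new UNKNOWN bound covering pages 4-9 is
 * inserted and becomes *last, and the return value is 6, the number of
 * newly inserted pages.  An identical second call returns 0, since the
 * whole range is then covered by existing bounds.
 */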
769 
770 /*
771  * vmu_update_bounds()
772  *
773  * tree: avl_tree in which first and last hang.
774  *
775  * first, last:	list of contiguous bounds, of which zero or more are of
776  * 		type VMUSAGE_BOUND_UNKNOWN.
777  *
778  * new_tree: avl_tree in which new_first and new_last hang.
779  *
780  * new_first, new_last:	list of contiguous bounds, of which none are of
781  *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
782  *			update the types of bounds in (first,last) with
783  *			type VMUSAGE_BOUND_UNKNOWN.
784  *
785  * For the list of bounds (first,last), this function updates any bounds
786  * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
787  * the list (new_first, new_last).
788  *
789  * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
790  * (new_first, new_last), it will be split into multiple bounds.
791  *
792  * Return value:
793  * 	The number of pages in the list of bounds (first,last) that were of
794  *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
795  *	VMUSAGE_BOUND_INCORE.
796  *
797  */
798 static pgcnt_t
799 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
800     avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
801 {
802 	vmu_bound_t *next, *new_next, *tmp;
803 	pgcnt_t rss = 0;
804 
805 	next = *first;
806 	new_next = new_first;
807 
808 	/*
809 	 * Verify first and last bound are covered by new bounds if they
810 	 * have unknown type.
811 	 */
812 	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
813 	    (*first)->vmb_start >= new_first->vmb_start);
814 	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
815 	    (*last)->vmb_end <= new_last->vmb_end);
816 	for (;;) {
817 		/* If bound already has type, proceed to next bound. */
818 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
819 			if (next == *last)
820 				break;
821 			next = AVL_NEXT(tree, next);
822 			continue;
823 		}
824 		while (new_next->vmb_end < next->vmb_start)
825 			new_next = AVL_NEXT(new_tree, new_next);
826 		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
827 		next->vmb_type = new_next->vmb_type;
828 		if (new_next->vmb_end < next->vmb_end) {
829 			/* need to split bound */
830 			tmp = vmu_alloc_bound();
831 			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
832 			tmp->vmb_start = new_next->vmb_end + 1;
833 			tmp->vmb_end = next->vmb_end;
834 			avl_insert_here(tree, tmp, next, AVL_AFTER);
835 			next->vmb_end = new_next->vmb_end;
836 			if (*last == next)
837 				*last = tmp;
838 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
839 				rss += next->vmb_end - next->vmb_start + 1;
840 			next = tmp;
841 		} else {
842 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
843 				rss += next->vmb_end - next->vmb_start + 1;
844 			if (next == *last)
845 				break;
846 			next = AVL_NEXT(tree, next);
847 		}
848 	}
849 	return (rss);
850 }
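
/*
 * Worked example for vmu_update_bounds() above (page numbers are made up):
 * suppose the entity's list is a single UNKNOWN bound covering pages 0-9,
 * and the shared object's tree holds pages 0-3 INCORE followed by 4-9
 * NOT_INCORE.  The entity's bound is split into 0-3 INCORE and 4-9
 * NOT_INCORE, *last is moved to the new 4-9 bound, and the function
 * returns 4, the number of pages newly found to be incore.
 */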
851 
852 /*
853  * Merges adjacent bounds with same type between first and last bound.
854  * After merge, last pointer may point to a different bound, as (incoming)
855  * last bound may have been merged away.
856  */
857 static void
858 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
859 {
860 	vmu_bound_t *current;
861 	vmu_bound_t *next;
862 
863 	ASSERT(tree != NULL);
864 	ASSERT(*first != NULL);
865 	ASSERT(*last != NULL);
866 
867 	current = *first;
868 	while (current != *last) {
869 		next = AVL_NEXT(tree, current);
870 		if ((current->vmb_end + 1) == next->vmb_start &&
871 		    current->vmb_type == next->vmb_type) {
872 			current->vmb_end = next->vmb_end;
873 			avl_remove(tree, next);
874 			vmu_free_bound(next);
875 			if (next == *last) {
876 				*last = current;
877 			}
878 		} else {
879 			current = AVL_NEXT(tree, current);
880 		}
881 	}
882 }
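
/*
 * Example for vmu_merge_bounds() above (page numbers are made up):
 * adjacent bounds covering pages 0-3 INCORE and 4-7 INCORE collapse into a
 * single 0-7 INCORE bound (with *last updated if the absorbed bound was
 * *last), while 0-3 INCORE followed by 4-7 NOT_INCORE is left as two
 * bounds because the types differ.
 */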
883 
884 /*
885  * Given an amp and a list of bounds, updates each bound's type with
886  * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
887  *
888  * If a bound is partially incore, it will be split into two bounds.
889  * first and last may be modified, as bounds may be split into multiple
890  * bounds if they are partially incore/not-incore.
891  *
892  * Set incore to B_TRUE if the bounds are already known to be incore.
893  *
894  */
895 static void
896 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
897     vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
898 {
899 	vmu_bound_t *next;
900 	vmu_bound_t *tmp;
901 	pgcnt_t index;
902 	short bound_type;
903 	short page_type;
904 	vnode_t *vn;
905 	anoff_t off;
906 	struct anon *ap;
907 
908 	next = *first;
909 	/* Shared anon slots don't change once set. */
910 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
911 	for (;;) {
912 		if (incore == B_TRUE)
913 			next->vmb_type = VMUSAGE_BOUND_INCORE;
914 
915 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
916 			if (next == *last)
917 				break;
918 			next = AVL_NEXT(tree, next);
919 			continue;
920 		}
921 		bound_type = next->vmb_type;
922 		index = next->vmb_start;
923 		while (index <= next->vmb_end) {
924 
925 			/*
926 			 * These are used to determine how much to increment
927 			 * index when a large page is found.
928 			 */
929 			page_t *page;
930 			pgcnt_t pgcnt = 1;
931 			uint_t pgshft;
932 			pgcnt_t pgmsk;
933 
934 			ap = anon_get_ptr(amp->ahp, index);
935 			if (ap != NULL)
936 				swap_xlate(ap, &vn, &off);
937 
938 			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
939 			    (page = page_exists(vn, off)) != NULL) {
940 				page_type = VMUSAGE_BOUND_INCORE;
941 				if (page->p_szc > 0) {
942 					pgcnt = page_get_pagecnt(page->p_szc);
943 					pgshft = page_get_shift(page->p_szc);
944 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
945 					    - 1;
946 				}
947 			} else {
948 				page_type = VMUSAGE_BOUND_NOT_INCORE;
949 			}
950 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
951 				next->vmb_type = page_type;
952 			} else if (next->vmb_type != page_type) {
953 				/*
954 				 * If current bound type does not match page
955 				 * type, need to split off new bound.
956 				 */
957 				tmp = vmu_alloc_bound();
958 				tmp->vmb_type = page_type;
959 				tmp->vmb_start = index;
960 				tmp->vmb_end = next->vmb_end;
961 				avl_insert_here(tree, tmp, next, AVL_AFTER);
962 				next->vmb_end = index - 1;
963 				if (*last == next)
964 					*last = tmp;
965 				next = tmp;
966 			}
967 			if (pgcnt > 1) {
968 				/*
969 				 * If inside large page, jump to next large
970 				 * page
971 				 */
972 				index = (index & ~pgmsk) + pgcnt;
973 			} else {
974 				index++;
975 			}
976 		}
977 		if (next == *last) {
978 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
979 			break;
980 		} else
981 			next = AVL_NEXT(tree, next);
982 	}
983 	ANON_LOCK_EXIT(&amp->a_rwlock);
984 }
985 
986 /*
987  * Same as vmu_amp_update_incore_bounds(), except for tracking
988  * incore-/not-incore for vnodes.
989  */
990 static void
991 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
992     vmu_bound_t **first, vmu_bound_t **last)
993 {
994 	vmu_bound_t *next;
995 	vmu_bound_t *tmp;
996 	pgcnt_t index;
997 	short bound_type;
998 	short page_type;
999 
1000 	next = *first;
1001 	for (;;) {
1002 		if (vnode->v_pages == NULL)
1003 			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1004 
1005 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1006 			if (next == *last)
1007 				break;
1008 			next = AVL_NEXT(tree, next);
1009 			continue;
1010 		}
1011 
1012 		bound_type = next->vmb_type;
1013 		index = next->vmb_start;
1014 		while (index <= next->vmb_end) {
1015 
1016 			/*
1017 			 * These are used to determine how much to increment
1018 			 * index when a large page is found.
1019 			 */
1020 			page_t *page;
1021 			pgcnt_t pgcnt = 1;
1022 			uint_t pgshft;
1023 			pgcnt_t pgmsk;
1024 
1025 			if (vnode->v_pages != NULL &&
1026 			    (page = page_exists(vnode, ptob(index))) != NULL) {
1027 				page_type = VMUSAGE_BOUND_INCORE;
1028 				if (page->p_szc > 0) {
1029 					pgcnt = page_get_pagecnt(page->p_szc);
1030 					pgshft = page_get_shift(page->p_szc);
1031 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
1032 					    - 1;
1033 				}
1034 			} else {
1035 				page_type = VMUSAGE_BOUND_NOT_INCORE;
1036 			}
1037 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1038 				next->vmb_type = page_type;
1039 			} else if (next->vmb_type != page_type) {
1040 				/*
1041 				 * If current bound type does not match page
1042 				 * type, need to split off new bound.
1043 				 */
1044 				tmp = vmu_alloc_bound();
1045 				tmp->vmb_type = page_type;
1046 				tmp->vmb_start = index;
1047 				tmp->vmb_end = next->vmb_end;
1048 				avl_insert_here(tree, tmp, next, AVL_AFTER);
1049 				next->vmb_end = index - 1;
1050 				if (*last == next)
1051 					*last = tmp;
1052 				next = tmp;
1053 			}
1054 			if (pgcnt > 1) {
1055 				/*
1056 				 * If inside large page, jump to next large
1057 				 * page
1058 				 */
1059 				index = (index & ~pgmsk) + pgcnt;
1060 			} else {
1061 				index++;
1062 			}
1063 		}
1064 		if (next == *last) {
1065 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1066 			break;
1067 		} else
1068 			next = AVL_NEXT(tree, next);
1069 	}
1070 }
1071 
1072 /*
1073  * Calculate the rss and swap consumed by a segment.  vmu_entities is the
1074  * list of entities to visit.  For shared segments, the vnode or amp
1075  * is looked up in each entity to see if it has been already counted.  Private
1076  * anon pages are checked per entity to ensure that COW pages are not
1077  * double counted.
1078  *
1079  * For private mapped files, first the amp is checked for private pages.
1080  * Bounds not backed by the amp are looked up in the vnode for each entity
1081  * to avoid double counting of private COW vnode pages.
1082  */
1083 static void
1084 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1085 {
1086 	struct segvn_data *svd;
1087 	struct shm_data *shmd;
1088 	struct spt_data *sptd;
1089 	vmu_object_t *shared_object = NULL;
1090 	vmu_object_t *entity_object = NULL;
1091 	vmu_entity_t *entity;
1092 	vmusage_t *result;
1093 	vmu_bound_t *first = NULL;
1094 	vmu_bound_t *last = NULL;
1095 	vmu_bound_t *cur = NULL;
1096 	vmu_bound_t *e_first = NULL;
1097 	vmu_bound_t *e_last = NULL;
1098 	vmu_bound_t *tmp;
1099 	pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1100 	struct anon_map *private_amp = NULL;
1101 	boolean_t incore = B_FALSE;
1102 	boolean_t shared = B_FALSE;
1103 	int file = 0;
1104 	pgcnt_t swresv = 0;
1105 	pgcnt_t panon = 0;
1106 
1107 	s_start = 0;
1108 	p_end = 0;
1109 	/* Can zero-length segments exist?  Not sure, so paranoia. */
1110 	if (seg->s_size <= 0)
1111 		return;
1112 
1113 	/*
1114 	 * Figure out if there is a shared object (such as a named vnode or
1115 	 * a shared amp), then figure out if there is a private amp, which
1116 	 * identifies private pages.
1117 	 */
1118 	if (seg->s_ops == &segvn_ops) {
1119 		svd = (struct segvn_data *)seg->s_data;
1120 		if (svd->type == MAP_SHARED) {
1121 			shared = B_TRUE;
1122 		} else {
1123 			swresv = svd->swresv;
1124 
1125 			if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1126 			    RW_READER) != 0) {
1127 				/*
1128 				 * Text replication anon maps can be shared
1129 				 * across all zones. Space used for text
1130 				 * replication is typically capped as a small %
1131 				 * of memory.  To keep it simple for now we
1132 				 * don't account for swap and memory space used
1133 				 * for text replication.
1134 				 */
1135 				if (svd->tr_state == SEGVN_TR_OFF &&
1136 				    svd->amp != NULL) {
1137 					private_amp = svd->amp;
1138 					p_start = svd->anon_index;
1139 					p_end = svd->anon_index +
1140 					    btop(seg->s_size) - 1;
1141 				}
1142 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1143 			}
1144 		}
1145 		if (svd->vp != NULL) {
1146 			file = 1;
1147 			shared_object = vmu_find_insert_object(
1148 			    vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1149 			    VMUSAGE_TYPE_VNODE);
1150 			s_start = btop(svd->offset);
1151 			s_end = btop(svd->offset + seg->s_size) - 1;
1152 		}
1153 		if (svd->amp != NULL && svd->type == MAP_SHARED) {
1154 			ASSERT(shared_object == NULL);
1155 			shared_object = vmu_find_insert_object(
1156 			    vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1157 			    VMUSAGE_TYPE_AMP);
1158 			s_start = svd->anon_index;
1159 			s_end = svd->anon_index + btop(seg->s_size) - 1;
1160 			/* schedctl mappings are always in core */
1161 			if (svd->amp->swresv == 0)
1162 				incore = B_TRUE;
1163 		}
1164 	} else if (seg->s_ops == &segspt_shmops) {
1165 		shared = B_TRUE;
1166 		shmd = (struct shm_data *)seg->s_data;
1167 		shared_object = vmu_find_insert_object(
1168 		    vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1169 		    VMUSAGE_TYPE_AMP);
1170 		s_start = 0;
1171 		s_end = btop(seg->s_size) - 1;
1172 		sptd = shmd->shm_sptseg->s_data;
1173 
1174 		/* ism segments are always incore and do not reserve swap */
1175 		if (sptd->spt_flags & SHM_SHARE_MMU)
1176 			incore = B_TRUE;
1177 
1178 	} else {
1179 		return;
1180 	}
1181 
1182 	/*
1183 	 * If there is a private amp, count anon pages that exist.  If an
1184 	 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1185 	 * hash so that it is not double counted.
1186 	 *
1187 	 * If there is also a shared object, then figure out the bounds
1188 	 * which are not mapped by the private amp.
1189 	 */
1190 	if (private_amp != NULL) {
1191 
1192 		/* Enter as writer to prevent COW anons from being freed */
1193 		ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1194 
1195 		p_index = p_start;
1196 		s_index = s_start;
1197 
1198 		while (p_index <= p_end) {
1199 
1200 			pgcnt_t p_index_next;
1201 			pgcnt_t p_bound_size;
1202 			int cnt;
1203 			anoff_t off;
1204 			struct vnode *vn;
1205 			struct anon *ap;
1206 			page_t *page;		/* For handling of large */
1207 			pgcnt_t pgcnt = 1;	/* pages */
1208 			pgcnt_t pgstart;
1209 			pgcnt_t pgend;
1210 			uint_t pgshft;
1211 			pgcnt_t pgmsk;
1212 
1213 			p_index_next = p_index;
1214 			ap = anon_get_next_ptr(private_amp->ahp,
1215 			    &p_index_next);
1216 
1217 			/*
1218 			 * If next anon is past end of mapping, simulate
1219 			 * end of anon so loop terminates.
1220 			 */
1221 			if (p_index_next > p_end) {
1222 				p_index_next = p_end + 1;
1223 				ap = NULL;
1224 			}
1225 			/*
1226 			 * For COW segments, keep track of bounds not
1227 			 * backed by private amp so they can be looked
1228 			 * up in the backing vnode
1229 			 */
1230 			if (p_index_next != p_index) {
1231 
1232 				/*
1233 				 * Compute index difference between anon and
1234 				 * previous anon.
1235 				 */
1236 				p_bound_size = p_index_next - p_index - 1;
1237 
1238 				if (shared_object != NULL) {
1239 					cur = vmu_alloc_bound();
1240 					cur->vmb_start = s_index;
1241 					cur->vmb_end = s_index + p_bound_size;
1242 					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1243 					if (first == NULL) {
1244 						first = cur;
1245 						last = cur;
1246 					} else {
1247 						last->vmb_next = cur;
1248 						last = cur;
1249 					}
1250 				}
1251 				p_index = p_index + p_bound_size + 1;
1252 				s_index = s_index + p_bound_size + 1;
1253 			}
1254 
1255 			/* Detect end of anons in amp */
1256 			if (ap == NULL)
1257 				break;
1258 
1259 			cnt = ap->an_refcnt;
1260 			swap_xlate(ap, &vn, &off);
1261 
1262 			if (vn == NULL || vn->v_pages == NULL ||
1263 			    (page = page_exists(vn, off)) == NULL) {
1264 				p_index++;
1265 				s_index++;
1266 				continue;
1267 			}
1268 
1269 			/*
1270 			 * If large page is found, compute portion of large
1271 			 * page in mapping, and increment indices to the next
1272 			 * large page.
1273 			 */
1274 			if (page->p_szc > 0) {
1275 
1276 				pgcnt = page_get_pagecnt(page->p_szc);
1277 				pgshft = page_get_shift(page->p_szc);
1278 				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1279 
1280 				/* First page in large page */
1281 				pgstart = p_index & ~pgmsk;
1282 				/* Last page in large page */
1283 				pgend = pgstart + pgcnt - 1;
1284 				/*
1285 				 * Artificially end page if page extends past
1286 				 * end of mapping.
1287 				 */
1288 				if (pgend > p_end)
1289 					pgend = p_end;
1290 
1291 				/*
1292 				 * Compute number of pages from large page
1293 				 * which are mapped.
1294 				 */
1295 				pgcnt = pgend - p_index + 1;
1296 
1297 				/*
1298 				 * Point indices at page after large page,
1299 				 * or at page after end of mapping.
1300 				 */
1301 				p_index += pgcnt;
1302 				s_index += pgcnt;
1303 			} else {
1304 				p_index++;
1305 				s_index++;
1306 			}
1307 
1308 			/*
1309 			 * Assume anon structs with a refcnt
1310 			 * of 1 are not COW shared, so there
1311 			 * is no reason to track them per entity.
1312 			 */
1313 			if (cnt == 1) {
1314 				panon += pgcnt;
1315 				continue;
1316 			}
1317 			for (entity = vmu_entities; entity != NULL;
1318 			    entity = entity->vme_next_calc) {
1319 
1320 				result = &entity->vme_result;
1321 				/*
1322 				 * Track COW anons per entity so
1323 				 * they are not double counted.
1324 				 */
1325 				if (vmu_find_insert_anon(entity->vme_anon_hash,
1326 				    (caddr_t)ap) == 0)
1327 					continue;
1328 
1329 				result->vmu_rss_all += (pgcnt << PAGESHIFT);
1330 				result->vmu_rss_private +=
1331 				    (pgcnt << PAGESHIFT);
1332 			}
1333 		}
1334 		ANON_LOCK_EXIT(&private_amp->a_rwlock);
1335 	}
1336 
1337 	/* Add up resident anon and swap reserved for private mappings */
1338 	if (swresv > 0 || panon > 0) {
1339 		for (entity = vmu_entities; entity != NULL;
1340 		    entity = entity->vme_next_calc) {
1341 			result = &entity->vme_result;
1342 			result->vmu_swap_all += swresv;
1343 			result->vmu_swap_private += swresv;
1344 			result->vmu_rss_all += (panon << PAGESHIFT);
1345 			result->vmu_rss_private += (panon << PAGESHIFT);
1346 		}
1347 	}
1348 
1349 	/* Compute resident pages backing shared amp or named vnode */
1350 	if (shared_object != NULL) {
1351 		avl_tree_t *tree = &(shared_object->vmo_bounds);
1352 
1353 		if (first == NULL) {
1354 			/*
1355 			 * No private amp, or private amp has no anon
1356 			 * structs.  This means entire segment is backed by
1357 			 * the shared object.
1358 			 */
1359 			first = vmu_alloc_bound();
1360 			first->vmb_start = s_start;
1361 			first->vmb_end = s_end;
1362 			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1363 		}
1364 		/*
1365 		 * Iterate bounds not backed by private amp, and compute
1366 		 * resident pages.
1367 		 */
1368 		cur = first;
1369 		while (cur != NULL) {
1370 
1371 			if (vmu_insert_lookup_object_bounds(shared_object,
1372 			    cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1373 			    &first, &last) > 0) {
1374 				/* new bounds, find incore/not-incore */
1375 				if (shared_object->vmo_type ==
1376 				    VMUSAGE_TYPE_VNODE) {
1377 					vmu_vnode_update_incore_bounds(
1378 					    tree,
1379 					    (vnode_t *)
1380 					    shared_object->vmo_key, &first,
1381 					    &last);
1382 				} else {
1383 					vmu_amp_update_incore_bounds(
1384 					    tree,
1385 					    (struct anon_map *)
1386 					    shared_object->vmo_key, &first,
1387 					    &last, incore);
1388 				}
1389 				vmu_merge_bounds(tree, &first, &last);
1390 			}
1391 			for (entity = vmu_entities; entity != NULL;
1392 			    entity = entity->vme_next_calc) {
1393 				avl_tree_t *e_tree;
1394 
1395 				result = &entity->vme_result;
1396 
1397 				entity_object = vmu_find_insert_object(
1398 				    shared_object->vmo_type ==
1399 				    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1400 				    entity->vme_amp_hash,
1401 				    shared_object->vmo_key,
1402 				    shared_object->vmo_type);
1403 
1404 				virt = vmu_insert_lookup_object_bounds(
1405 				    entity_object, cur->vmb_start, cur->vmb_end,
1406 				    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1407 
1408 				if (virt == 0)
1409 					continue;
1410 				/*
1411 				 * Range visited for this entity
1412 				 */
1413 				e_tree = &(entity_object->vmo_bounds);
1414 				rss = vmu_update_bounds(e_tree, &e_first,
1415 				    &e_last, tree, first, last);
1416 				result->vmu_rss_all += (rss << PAGESHIFT);
1417 				if (shared == B_TRUE && file == B_FALSE) {
1418 					/* shared anon mapping */
1419 					result->vmu_swap_all +=
1420 					    (virt << PAGESHIFT);
1421 					result->vmu_swap_shared +=
1422 					    (virt << PAGESHIFT);
1423 					result->vmu_rss_shared +=
1424 					    (rss << PAGESHIFT);
1425 				} else if (shared == B_TRUE && file == B_TRUE) {
1426 					/* shared file mapping */
1427 					result->vmu_rss_shared +=
1428 					    (rss << PAGESHIFT);
1429 				} else if (shared == B_FALSE &&
1430 				    file == B_TRUE) {
1431 					/* private file mapping */
1432 					result->vmu_rss_private +=
1433 					    (rss << PAGESHIFT);
1434 				}
1435 				vmu_merge_bounds(e_tree, &e_first, &e_last);
1436 			}
1437 			tmp = cur;
1438 			cur = cur->vmb_next;
1439 			vmu_free_bound(tmp);
1440 		}
1441 	}
1442 }
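
/*
 * Example of how vmu_calculate_seg() above classifies a segment (details
 * are made up): for a MAP_PRIVATE mapping of a file where some pages have
 * been copied on write, resident anon pages and svd->swresv are added to
 * vmu_rss_private/vmu_swap_private, while resident file pages not shadowed
 * by an anon slot are also counted as private in the shared-object pass
 * (shared == B_FALSE, file == B_TRUE).  For a MAP_SHARED anonymous
 * mapping, the looked-up virtual range is added to vmu_swap_shared and the
 * resident portion to vmu_rss_shared.
 */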
1443 
1444 /*
1445  * Based on the current calculation flags, find the entities relevant to
1446  * the process.  Then calculate each segment in the process's address
1447  * space for each relevant entity.
1448  */
1449 static void
1450 vmu_calculate_proc(proc_t *p)
1451 {
1452 	vmu_entity_t *entities = NULL;
1453 	vmu_zone_t *zone;
1454 	vmu_entity_t *tmp;
1455 	struct as *as;
1456 	struct seg *seg;
1457 	int ret;
1458 
1459 	/* Figure out which entities are being computed */
1460 	if ((vmu_data.vmu_system) != NULL) {
1461 		tmp = vmu_data.vmu_system;
1462 		tmp->vme_next_calc = entities;
1463 		entities = tmp;
1464 	}
1465 	if (vmu_data.vmu_calc_flags &
1466 	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1467 	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1468 	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1469 	    VMUSAGE_ALL_EUSERS)) {
1470 		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1471 		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1472 		    (mod_hash_val_t *)&zone);
1473 		if (ret != 0) {
1474 			zone = vmu_alloc_zone(p->p_zone->zone_id);
1475 			ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1476 			    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1477 			    (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1478 			ASSERT(ret == 0);
1479 		}
1480 		if (zone->vmz_zone != NULL) {
1481 			tmp = zone->vmz_zone;
1482 			tmp->vme_next_calc = entities;
1483 			entities = tmp;
1484 		}
1485 		if (vmu_data.vmu_calc_flags &
1486 		    (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1487 			tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1488 			    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1489 			    zone->vmz_id);
1490 			tmp->vme_next_calc = entities;
1491 			entities = tmp;
1492 		}
1493 		if (vmu_data.vmu_calc_flags &
1494 		    (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1495 			tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1496 			    p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1497 			tmp->vme_next_calc = entities;
1498 			entities = tmp;
1499 		}
1500 		if (vmu_data.vmu_calc_flags &
1501 		    (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1502 			tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1503 			    crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1504 			tmp->vme_next_calc = entities;
1505 			entities = tmp;
1506 		}
1507 		if (vmu_data.vmu_calc_flags &
1508 		    (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1509 			tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1510 			    crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1511 			tmp->vme_next_calc = entities;
1512 			entities = tmp;
1513 		}
1514 	}
1515 	/* Entities which collapse projects and users for all zones */
1516 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1517 		tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1518 		    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1519 		tmp->vme_next_calc = entities;
1520 		entities = tmp;
1521 	}
1522 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1523 		tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1524 		    crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1525 		tmp->vme_next_calc = entities;
1526 		entities = tmp;
1527 	}
1528 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1529 		tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1530 		    crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1531 		tmp->vme_next_calc = entities;
1532 		entities = tmp;
1533 	}
1534 
1535 	ASSERT(entities != NULL);
1536 	/* process all segs in process's address space */
1537 	as = p->p_as;
1538 	AS_LOCK_ENTER(as, RW_READER);
1539 	for (seg = AS_SEGFIRST(as); seg != NULL;
1540 	    seg = AS_SEGNEXT(as, seg)) {
1541 		vmu_calculate_seg(entities, seg);
1542 	}
1543 	AS_LOCK_EXIT(as);
1544 }
1545 
1546 /*
1547  * Free data created by previous call to vmu_calculate().
1548  */
1549 static void
1550 vmu_clear_calc()
1551 {
1552 	if (vmu_data.vmu_system != NULL) {
1553 		vmu_free_entity(vmu_data.vmu_system);
1554 		vmu_data.vmu_system = NULL;
1555 	}
1556 	if (vmu_data.vmu_zones_hash != NULL)
1557 		i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1558 	if (vmu_data.vmu_projects_col_hash != NULL)
1559 		i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1560 	if (vmu_data.vmu_rusers_col_hash != NULL)
1561 		i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1562 	if (vmu_data.vmu_eusers_col_hash != NULL)
1563 		i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1564 
1565 	i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1566 	i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1567 }
1568 
1569 /*
1570  * Free unused data structures.  These can result if the system workload
1571  * decreases between calculations.
1572  */
1573 static void
1574 vmu_free_extra()
1575 {
1576 	vmu_bound_t *tb;
1577 	vmu_object_t *to;
1578 	vmu_entity_t *te;
1579 	vmu_zone_t *tz;
1580 
1581 	while (vmu_data.vmu_free_bounds != NULL) {
1582 		tb = vmu_data.vmu_free_bounds;
1583 		vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1584 		kmem_cache_free(vmu_bound_cache, tb);
1585 	}
1586 	while (vmu_data.vmu_free_objects != NULL) {
1587 		to = vmu_data.vmu_free_objects;
1588 		vmu_data.vmu_free_objects =
1589 		    vmu_data.vmu_free_objects->vmo_next;
1590 		kmem_cache_free(vmu_object_cache, to);
1591 	}
1592 	while (vmu_data.vmu_free_entities != NULL) {
1593 		te = vmu_data.vmu_free_entities;
1594 		vmu_data.vmu_free_entities =
1595 		    vmu_data.vmu_free_entities->vme_next;
1596 		if (te->vme_vnode_hash != NULL)
1597 			mod_hash_destroy_hash(te->vme_vnode_hash);
1598 		if (te->vme_amp_hash != NULL)
1599 			mod_hash_destroy_hash(te->vme_amp_hash);
1600 		if (te->vme_anon_hash != NULL)
1601 			mod_hash_destroy_hash(te->vme_anon_hash);
1602 		kmem_free(te, sizeof (vmu_entity_t));
1603 	}
1604 	while (vmu_data.vmu_free_zones != NULL) {
1605 		tz = vmu_data.vmu_free_zones;
1606 		vmu_data.vmu_free_zones =
1607 		    vmu_data.vmu_free_zones->vmz_next;
1608 		if (tz->vmz_projects_hash != NULL)
1609 			mod_hash_destroy_hash(tz->vmz_projects_hash);
1610 		if (tz->vmz_tasks_hash != NULL)
1611 			mod_hash_destroy_hash(tz->vmz_tasks_hash);
1612 		if (tz->vmz_rusers_hash != NULL)
1613 			mod_hash_destroy_hash(tz->vmz_rusers_hash);
1614 		if (tz->vmz_eusers_hash != NULL)
1615 			mod_hash_destroy_hash(tz->vmz_eusers_hash);
1616 		kmem_free(tz, sizeof (vmu_zone_t));
1617 	}
1618 }
1619 
1620 extern kcondvar_t *pr_pid_cv;
1621 
1622 /*
1623  * Determine which entity types are relevant and allocate the hashes to
1624  * track them.  Then walk the process table and count rss and swap
1625  * for each process's address space.  Address space objects such as
1626  * vnodes, amps and anons are tracked per entity, so that they are
1627  * not double counted in the results.
1628  *
1629  */
1630 static void
1631 vmu_calculate()
1632 {
1633 	int i = 0;
1634 	int ret;
1635 	proc_t *p;
1636 
1637 	vmu_clear_calc();
1638 
1639 	if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1640 		vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1641 		    ALL_ZONES);
1642 
1643 	/*
1644 	 * Walk process table and calculate rss of each proc.
1645 	 *
1646 	 * Pidlock and p_lock cannot be held while doing the rss calculation.
1647 	 * This is because:
1648 	 *	1.  The calculation allocates using KM_SLEEP.
1649 	 *	2.  The calculation grabs a_lock, which cannot be grabbed
1650 	 *	    after p_lock.
1651 	 *
1652 	 * Since pidlock must be dropped, we cannot simply walk the
1653 	 * practive list.  Instead, we walk the process table, and sprlock
1654 	 * each process to ensure that it does not exit during the
1655 	 * calculation.
1656 	 */
1657 
1658 	mutex_enter(&pidlock);
1659 	for (i = 0; i < v.v_proc; i++) {
1660 again:
1661 		p = pid_entry(i);
1662 		if (p == NULL)
1663 			continue;
1664 
1665 		mutex_enter(&p->p_lock);
1666 		mutex_exit(&pidlock);
1667 
1668 		if (panicstr) {
1669 			mutex_exit(&p->p_lock);
1670 			return;
1671 		}
1672 
1673 		/* Try to set P_PR_LOCK */
1674 		ret = sprtrylock_proc(p);
1675 		if (ret == -1) {
1676 			/* Process in invalid state */
1677 			mutex_exit(&p->p_lock);
1678 			mutex_enter(&pidlock);
1679 			continue;
1680 		} else if (ret == 1) {
1681 			/*
1682 			 * P_PR_LOCK is already set.  Wait and try again.
1683 			 * This also drops p_lock.
1684 			 */
1685 			sprwaitlock_proc(p);
1686 			mutex_enter(&pidlock);
1687 			goto again;
1688 		}
1689 		mutex_exit(&p->p_lock);
1690 
1691 		vmu_calculate_proc(p);
1692 
1693 		mutex_enter(&p->p_lock);
1694 		sprunlock(p);
1695 		mutex_enter(&pidlock);
1696 	}
1697 	mutex_exit(&pidlock);
1698 
1699 	vmu_free_extra();
1700 }
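/*
 * Editorial sketch (not part of the original source): the process-table
 * walk above, reduced to its locking skeleton.  vmu_do_work() is a
 * hypothetical stand-in for vmu_calculate_proc().  The point is the
 * sprlock protocol: pidlock and p_lock are dropped before the real work,
 * and P_PR_LOCK keeps the process from exiting in the meantime.
 */
#if 0	/* illustrative only */
static void
vmu_walk_procs_sketch(void (*vmu_do_work)(proc_t *))
{
	proc_t *p;
	int i, ret;

	mutex_enter(&pidlock);
	for (i = 0; i < v.v_proc; i++) {
retry:
		if ((p = pid_entry(i)) == NULL)
			continue;

		mutex_enter(&p->p_lock);
		mutex_exit(&pidlock);

		/*
		 * sprtrylock_proc(): -1 means the process is in an invalid
		 * state (skip it), 1 means P_PR_LOCK is already held (wait,
		 * which drops p_lock, then retry), 0 means we hold the lock.
		 */
		ret = sprtrylock_proc(p);
		if (ret == -1) {
			mutex_exit(&p->p_lock);
			mutex_enter(&pidlock);
			continue;
		} else if (ret == 1) {
			sprwaitlock_proc(p);
			mutex_enter(&pidlock);
			goto retry;
		}
		mutex_exit(&p->p_lock);

		/* Neither pidlock nor p_lock is held during the real work. */
		vmu_do_work(p);

		mutex_enter(&p->p_lock);
		sprunlock(p);
		mutex_enter(&pidlock);
	}
	mutex_exit(&pidlock);
}
#endif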
1701 
1702 /*
1703  * Allocate a new cache with room for nres results, tagged with the given flags
1704  */
1705 vmu_cache_t *
1706 vmu_cache_alloc(size_t nres, uint_t flags)
1707 {
1708 	vmu_cache_t *cache;
1709 
1710 	cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1711 	cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1712 	cache->vmc_nresults = nres;
1713 	cache->vmc_flags = flags;
1714 	cache->vmc_refcnt = 1;
1715 	return (cache);
1716 }
1717 
1718 /*
1719  * Take a hold on the cache so that its results are not freed while in use
1720  */
1721 static void
1722 vmu_cache_hold(vmu_cache_t *cache)
1723 {
1724 	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1725 	cache->vmc_refcnt++;
1726 }
1727 
1728 /*
1729  * Release a reference on the cache; free it when the last reference is dropped
1730  */
1731 static void
1732 vmu_cache_rele(vmu_cache_t *cache)
1733 {
1734 	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1735 	ASSERT(cache->vmc_refcnt > 0);
1736 	cache->vmc_refcnt--;
1737 	if (cache->vmc_refcnt == 0) {
1738 		kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1739 		    cache->vmc_nresults);
1740 		kmem_free(cache, sizeof (vmu_cache_t));
1741 	}
1742 }
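/*
 * Editorial sketch (not part of the original source): the intended life
 * cycle of a cache reference, mirroring the use in vm_getusage() below.
 * vmu_consume_results() is a hypothetical stand-in for
 * vmu_copyout_results().  The hold is taken and released under
 * vmu_data.vmu_lock; the lock is dropped around the slow copyout.
 */
#if 0	/* illustrative only */
	vmu_cache_t *cache;

	mutex_enter(&vmu_data.vmu_lock);
	cache = vmu_data.vmu_cache;
	vmu_cache_hold(cache);			/* results cannot be freed */
	mutex_exit(&vmu_data.vmu_lock);

	(void) vmu_consume_results(cache);	/* hypothetical consumer */

	mutex_enter(&vmu_data.vmu_lock);
	vmu_cache_rele(cache);			/* frees cache on last release */
	mutex_exit(&vmu_data.vmu_lock);
#endif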
1743 
1744 /*
1745  * Copy out the cached results to a caller.  Inspect the caller's flags
1746  * and zone to determine which cached results should be copied.
1747  */
1748 static int
1749 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1750     uint_t flags, int cpflg)
1751 {
1752 	vmusage_t *result, *out_result;
1753 	vmusage_t dummy;
1754 	size_t i, count = 0;
1755 	size_t bufsize;
1756 	int ret = 0;
1757 	uint_t types = 0;
1758 
1759 	if (nres != NULL) {
1760 		if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1761 			return (set_errno(EFAULT));
1762 	} else {
1763 		bufsize = 0;
1764 	}
1765 
1766 	/* figure out what results the caller is interested in. */
1767 	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1768 		types |= VMUSAGE_SYSTEM;
1769 	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1770 		types |= VMUSAGE_ZONE;
1771 	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1772 	    VMUSAGE_COL_PROJECTS))
1773 		types |= VMUSAGE_PROJECTS;
1774 	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1775 		types |= VMUSAGE_TASKS;
1776 	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1777 		types |= VMUSAGE_RUSERS;
1778 	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1779 		types |= VMUSAGE_EUSERS;
1780 
1781 	/* Count (and copy out) the results visible to the caller */
1782 	out_result = buf;
1783 	for (result = cache->vmc_results, i = 0;
1784 	    i < cache->vmc_nresults; result++, i++) {
1785 
1786 		/* Do not return "other-zone" results to non-global zones */
1787 		if (curproc->p_zone != global_zone &&
1788 		    curproc->p_zone->zone_id != result->vmu_zoneid)
1789 			continue;
1790 
1791 		/*
1792 		 * If non-global zone requests VMUSAGE_SYSTEM, fake
1793 		 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1794 		 */
1795 		if (curproc->p_zone != global_zone &&
1796 		    (flags & VMUSAGE_SYSTEM) != 0 &&
1797 		    result->vmu_type == VMUSAGE_ZONE) {
1798 			count++;
1799 			if (out_result != NULL) {
1800 				if (bufsize < count) {
1801 					ret = set_errno(EOVERFLOW);
1802 				} else {
1803 					dummy = *result;
1804 					dummy.vmu_zoneid = ALL_ZONES;
1805 					dummy.vmu_id = 0;
1806 					dummy.vmu_type = VMUSAGE_SYSTEM;
1807 					if (ddi_copyout(&dummy, out_result,
1808 					    sizeof (vmusage_t), cpflg))
1809 						return (set_errno(EFAULT));
1810 					out_result++;
1811 				}
1812 			}
1813 		}
1814 
1815 		/* Skip results that do not match requested type */
1816 		if ((result->vmu_type & types) == 0)
1817 			continue;
1818 
1819 		/* Skip collated results if not requested */
1820 		if (result->vmu_zoneid == ALL_ZONES) {
1821 			if (result->vmu_type == VMUSAGE_PROJECTS &&
1822 			    (flags & VMUSAGE_COL_PROJECTS) == 0)
1823 				continue;
1824 			if (result->vmu_type == VMUSAGE_EUSERS &&
1825 			    (flags & VMUSAGE_COL_EUSERS) == 0)
1826 				continue;
1827 			if (result->vmu_type == VMUSAGE_RUSERS &&
1828 			    (flags & VMUSAGE_COL_RUSERS) == 0)
1829 				continue;
1830 		}
1831 
1832 		/* Skip "other zone" results if not requested */
1833 		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1834 			if (result->vmu_type == VMUSAGE_ZONE &&
1835 			    (flags & VMUSAGE_ALL_ZONES) == 0)
1836 				continue;
1837 			if (result->vmu_type == VMUSAGE_PROJECTS &&
1838 			    (flags & (VMUSAGE_ALL_PROJECTS |
1839 			    VMUSAGE_COL_PROJECTS)) == 0)
1840 				continue;
1841 			if (result->vmu_type == VMUSAGE_TASKS &&
1842 			    (flags & VMUSAGE_ALL_TASKS) == 0)
1843 				continue;
1844 			if (result->vmu_type == VMUSAGE_RUSERS &&
1845 			    (flags & (VMUSAGE_ALL_RUSERS |
1846 			    VMUSAGE_COL_RUSERS)) == 0)
1847 				continue;
1848 			if (result->vmu_type == VMUSAGE_EUSERS &&
1849 			    (flags & (VMUSAGE_ALL_EUSERS |
1850 			    VMUSAGE_COL_EUSERS)) == 0)
1851 				continue;
1852 		}
1853 		count++;
1854 		if (out_result != NULL) {
1855 			if (bufsize < count) {
1856 				ret = set_errno(EOVERFLOW);
1857 			} else {
1858 				if (ddi_copyout(result, out_result,
1859 				    sizeof (vmusage_t), cpflg))
1860 					return (set_errno(EFAULT));
1861 				out_result++;
1862 			}
1863 		}
1864 	}
1865 	if (nres != NULL)
1866 		if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1867 			return (set_errno(EFAULT));
1868 
1869 	return (ret);
1870 }
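/*
 * Editorial worked example of the filtering above: a global-zone caller
 * passing VMUSAGE_ALL_PROJECTS maps to the VMUSAGE_PROJECTS result type,
 * so per-zone project results for every zone are copied out, while the
 * collated (vmu_zoneid == ALL_ZONES) project results are copied only if
 * VMUSAGE_COL_PROJECTS was also requested.  If the supplied buffer is too
 * small, EOVERFLOW is returned, but the full count is still written to
 * *nres, so a caller may size its buffer by first calling with buf == NULL.
 */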
1871 
1872 /*
1873  * vm_getusage()
1874  *
1875  * Counts rss and swap by zone, project, task, and/or user.  The flags argument
1876  * determines the type of result structures returned.  Flags requesting
1877  * results from more than one zone are "flattened" to the local zone if the
1878  * caller is not the global zone.
1879  *
1880  * args:
1881  *	flags:	bitmap consisting of one or more of VMUSAGE_*.
1882  *	age:	maximum allowable age, in seconds, of the results (time
1883  *		since the counting was done).  Results from previous
1884  *		callers are cached in the kernel.
1885  *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
1886  *		is set on success.
1887  *	nres:	Set to number of vmusage_t structures pointed to by buf
1888  *		before calling vm_getusage().
1889  *		On return of 0 (success) or EOVERFLOW, it is set to the number
1890  *		of result structures returned or that would have been returned.
1891  *
1892  * returns 0 on success, -1 on failure:
1893  *	EINTR (interrupted)
1894  *	EOVERFLOW (nres too small for results; set to the value needed for success)
1895  *	EINVAL (flags invalid)
1896  *	EFAULT (bad address for buf or nres)
1897  */
1898 int
1899 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1900 {
1901 	vmu_entity_t *entity;
1902 	vmusage_t *result;
1903 	int ret = 0;
1904 	int cacherecent = 0;
1905 	hrtime_t now;
1906 	uint_t flags_orig;
1907 
1908 	/*
1909 	 * Non-global zones cannot request system-wide and/or collated
1910 	 * results, nor the VMUSAGE_SYSTEM result, so munge the flags accordingly.
1911 	 */
1912 	flags_orig = flags;
1913 	if (curproc->p_zone != global_zone) {
1914 		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1915 			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1916 			flags |= VMUSAGE_PROJECTS;
1917 		}
1918 		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1919 			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1920 			flags |= VMUSAGE_RUSERS;
1921 		}
1922 		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1923 			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1924 			flags |= VMUSAGE_EUSERS;
1925 		}
1926 		if (flags & VMUSAGE_SYSTEM) {
1927 			flags &= ~VMUSAGE_SYSTEM;
1928 			flags |= VMUSAGE_ZONE;
1929 		}
1930 	}
1931 
1932 	/* Check for unknown flags */
1933 	if ((flags & (~VMUSAGE_MASK)) != 0)
1934 		return (set_errno(EINVAL));
1935 
1936 	/* Check for no flags */
1937 	if ((flags & VMUSAGE_MASK) == 0)
1938 		return (set_errno(EINVAL));
1939 
1940 	mutex_enter(&vmu_data.vmu_lock);
1941 	now = gethrtime();
1942 
1943 start:
1944 	if (vmu_data.vmu_cache != NULL) {
1945 
1946 		vmu_cache_t *cache;
1947 
1948 		if ((vmu_data.vmu_cache->vmc_timestamp +
1949 		    ((hrtime_t)age * NANOSEC)) > now)
1950 			cacherecent = 1;
1951 
1952 		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
1953 		    cacherecent == 1) {
1954 			cache = vmu_data.vmu_cache;
1955 			vmu_cache_hold(cache);
1956 			mutex_exit(&vmu_data.vmu_lock);
1957 
1958 			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
1959 			    cpflg);
1960 			mutex_enter(&vmu_data.vmu_lock);
1961 			vmu_cache_rele(cache);
1962 			if (vmu_data.vmu_pending_waiters > 0)
1963 				cv_broadcast(&vmu_data.vmu_cv);
1964 			mutex_exit(&vmu_data.vmu_lock);
1965 			return (ret);
1966 		}
1967 		/*
1968 		 * If the cache is recent, it is likely that there are other
1969 		 * consumers of vm_getusage running, so add their flags to the
1970 		 * desired flags for the calculation.
1971 		 */
1972 		if (cacherecent == 1)
1973 			flags = vmu_data.vmu_cache->vmc_flags | flags;
1974 	}
1975 	if (vmu_data.vmu_calc_thread == NULL) {
1976 
1977 		vmu_cache_t *cache;
1978 
1979 		vmu_data.vmu_calc_thread = curthread;
1980 		vmu_data.vmu_calc_flags = flags;
1981 		vmu_data.vmu_entities = NULL;
1982 		vmu_data.vmu_nentities = 0;
1983 		if (vmu_data.vmu_pending_waiters > 0)
1984 			vmu_data.vmu_calc_flags |=
1985 			    vmu_data.vmu_pending_flags;
1986 
1987 		vmu_data.vmu_pending_flags = 0;
1988 		mutex_exit(&vmu_data.vmu_lock);
1989 		vmu_calculate();
1990 		mutex_enter(&vmu_data.vmu_lock);
1991 		/* copy results to cache */
1992 		if (vmu_data.vmu_cache != NULL)
1993 			vmu_cache_rele(vmu_data.vmu_cache);
1994 		cache = vmu_data.vmu_cache =
1995 		    vmu_cache_alloc(vmu_data.vmu_nentities,
1996 		    vmu_data.vmu_calc_flags);
1997 
1998 		result = cache->vmc_results;
1999 		for (entity = vmu_data.vmu_entities; entity != NULL;
2000 		    entity = entity->vme_next) {
2001 			*result = entity->vme_result;
2002 			result++;
2003 		}
2004 		cache->vmc_timestamp = gethrtime();
2005 		vmu_cache_hold(cache);
2006 
2007 		vmu_data.vmu_calc_flags = 0;
2008 		vmu_data.vmu_calc_thread = NULL;
2009 
2010 		if (vmu_data.vmu_pending_waiters > 0)
2011 			cv_broadcast(&vmu_data.vmu_cv);
2012 
2013 		mutex_exit(&vmu_data.vmu_lock);
2014 
2015 		/* copy cache */
2016 		ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
2017 		mutex_enter(&vmu_data.vmu_lock);
2018 		vmu_cache_rele(cache);
2019 		mutex_exit(&vmu_data.vmu_lock);
2020 
2021 		return (ret);
2022 	}
2023 	vmu_data.vmu_pending_flags |= flags;
2024 	vmu_data.vmu_pending_waiters++;
2025 	while (vmu_data.vmu_calc_thread != NULL) {
2026 		if (cv_wait_sig(&vmu_data.vmu_cv,
2027 		    &vmu_data.vmu_lock) == 0) {
2028 			vmu_data.vmu_pending_waiters--;
2029 			mutex_exit(&vmu_data.vmu_lock);
2030 			return (set_errno(EINTR));
2031 		}
2032 	}
2033 	vmu_data.vmu_pending_waiters--;
2034 	goto start;
2035 }
2036
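/*
 * Editorial usage sketch (not part of the original source): how this
 * interface is typically reached from userland via the getvmusage(2)
 * libc wrapper.  The two-call pattern (size, then fetch) relies on the
 * buf == NULL behavior documented above; error handling is minimal and
 * the printed vmusage_t fields are assumed from <sys/vm_usage.h>.
 */
#if 0	/* userland example, not kernel code */
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t nres = 0;
	size_t i;
	vmusage_t *buf;

	/* First call: NULL buffer, just learn how many results exist. */
	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 5, NULL, &nres) != 0) {
		perror("getvmusage");
		return (1);
	}

	if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
		return (1);

	/* Second call: copy out up to nres results no older than 5 seconds. */
	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 5, buf, &nres) != 0) {
		perror("getvmusage");
		free(buf);
		return (1);
	}

	for (i = 0; i < nres; i++) {
		(void) printf("type %u id %d rss %llu swap %llu\n",
		    buf[i].vmu_type, (int)buf[i].vmu_id,
		    (u_longlong_t)buf[i].vmu_rss_all,
		    (u_longlong_t)buf[i].vmu_swap_all);
	}
	free(buf);
	return (0);
}
#endif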