1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#include <sys/exacct.h>
26#include <sys/exacct_catalog.h>
27#include <sys/disp.h>
28#include <sys/task.h>
29#include <sys/proc.h>
30#include <sys/cmn_err.h>
31#include <sys/kmem.h>
32#include <sys/project.h>
33#include <sys/systm.h>
34#include <sys/vnode.h>
35#include <sys/file.h>
36#include <sys/acctctl.h>
37#include <sys/time.h>
38#include <sys/utsname.h>
39#include <sys/session.h>
40#include <sys/sysmacros.h>
41#include <sys/bitmap.h>
42#include <sys/msacct.h>
43
44/*
45 * exacct usage and recording routines
46 *
47 * wracct(2), getacct(2), and the records written at process or task
48 * termination are constructed using the exacct_assemble_[task,proc]_usage()
 * functions, which take a callback that performs the appropriate action on
 * the packed exacct record for the task or process.  For the process-related
 * actions, we partition the routines such that the data-collecting component
52 * can be performed while holding p_lock, and all sleeping or blocking
53 * operations can be performed without acquiring p_lock.
54 *
55 * putacct(2), which allows an application to construct a customized record
56 * associated with an existing process or task, has its own entry points:
57 * exacct_tag_task() and exacct_tag_proc().
58 */
59
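/*
 * The exacct_assemble_*_usage() routines hand the packed record to a callback
 * with the common signature
 *
 *	int (*cb)(ac_info_t *, void *ubuf, size_t ubufsize,
 *	    void *buf, size_t bufsize, size_t *actual);
 *
 * where buf/bufsize describe the packed record and ubuf/ubufsize describe an
 * optional user-supplied buffer.  exacct_commit_callback(), below, is the
 * kernel's file-writing instance of this callback.
 */
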
60taskq_t *exacct_queue;
61kmem_cache_t *exacct_object_cache;
62
63zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;
64
65static const uint32_t exacct_version = EXACCT_VERSION;
66static const char exacct_header[] = "exacct";
67static const char exacct_creator[] = "SunOS";
68
69ea_object_t *
70ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
71{
72	ea_object_t *item;
73
74	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
75	bzero(item, sizeof (ea_object_t));
76	(void) ea_set_item(item, catalog, buf, bufsz);
77	return (item);
78}
79
80ea_object_t *
81ea_alloc_group(ea_catalog_t catalog)
82{
83	ea_object_t *group;
84
85	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
86	bzero(group, sizeof (ea_object_t));
87	(void) ea_set_group(group, catalog);
88	return (group);
89}
90
91ea_object_t *
92ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
93{
94	ea_object_t *item;
95
96	item = ea_alloc_item(catalog, buf, bufsz);
97	(void) ea_attach_to_group(grp, item);
98	return (item);
99}
100
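/*
 * For illustration, a typical construction sequence using these wrappers
 * looks like the following sketch (the names are illustrative only):
 *
 *	grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC);
 *	(void) ea_attach_item(grp, &pid, sizeof (uint32_t),
 *	    EXT_UINT32 | EXD_PROC_PID);
 *	bufsize = ea_pack_object(grp, NULL, 0);
 *	buf = kmem_alloc(bufsize, KM_SLEEP);
 *	(void) ea_pack_object(grp, buf, bufsize);
 *	ea_free_object(grp, EUP_ALLOC);
 *
 * exacct_create_header() and the record assembly routines below all follow
 * this same pattern.
 */
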
101/*
 * exacct_add_task_mstate() and exacct_sub_task_mstate() add or subtract the
 * microstate accounting data and resource usage counters supplied in one
 * task_usage_t (the delta) to or from those in another.  These functions do
 * not operate on *all* members of a task_usage_t: for some (e.g. tu_anctaskid)
 * it would not make sense.
107 */
108static void
109exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta)
110{
111	tu->tu_utime  += delta->tu_utime;
112	tu->tu_stime  += delta->tu_stime;
113	tu->tu_minflt += delta->tu_minflt;
114	tu->tu_majflt += delta->tu_majflt;
115	tu->tu_sndmsg += delta->tu_sndmsg;
116	tu->tu_rcvmsg += delta->tu_rcvmsg;
117	tu->tu_ioch   += delta->tu_ioch;
118	tu->tu_iblk   += delta->tu_iblk;
119	tu->tu_oblk   += delta->tu_oblk;
120	tu->tu_vcsw   += delta->tu_vcsw;
121	tu->tu_icsw   += delta->tu_icsw;
122	tu->tu_nsig   += delta->tu_nsig;
123	tu->tu_nswp   += delta->tu_nswp;
124	tu->tu_nscl   += delta->tu_nscl;
125}
126
127/*
128 * See the comments for exacct_add_task_mstate(), above.
129 */
130static void
131exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta)
132{
133	tu->tu_utime  -= delta->tu_utime;
134	tu->tu_stime  -= delta->tu_stime;
135	tu->tu_minflt -= delta->tu_minflt;
136	tu->tu_majflt -= delta->tu_majflt;
137	tu->tu_sndmsg -= delta->tu_sndmsg;
138	tu->tu_rcvmsg -= delta->tu_rcvmsg;
139	tu->tu_ioch   -= delta->tu_ioch;
140	tu->tu_iblk   -= delta->tu_iblk;
141	tu->tu_oblk   -= delta->tu_oblk;
142	tu->tu_vcsw   -= delta->tu_vcsw;
143	tu->tu_icsw   -= delta->tu_icsw;
144	tu->tu_nsig   -= delta->tu_nsig;
145	tu->tu_nswp   -= delta->tu_nswp;
146	tu->tu_nscl   -= delta->tu_nscl;
147}
148
149/*
150 * Wrapper for vn_rdwr() used by exacct_vn_write() and exacct_write_header()
151 * to write to the accounting file without corrupting it in case of an I/O or
152 * filesystem error.
153 */
154static int
155exacct_vn_write_impl(ac_info_t *info, void *buf, ssize_t bufsize)
156{
157	int error;
158	ssize_t resid;
159	struct vattr va;
160
161	ASSERT(info != NULL);
162	ASSERT(info->ac_vnode != NULL);
163	ASSERT(MUTEX_HELD(&info->ac_lock));
164
165	/*
166	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
167	 * the present accounting file.
168	 */
169	va.va_mask = AT_SIZE;
170	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred, NULL);
171	if (error == 0) {
172		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
173		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
174		    kcred, &resid);
175		if (error) {
176			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
177		} else if (resid != 0) {
178			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
179			error = ENOSPC;
180		}
181	}
182	return (error);
183}
184
185/*
186 * exacct_vn_write() safely writes to an accounting file.  acctctl() prevents
187 * the two accounting vnodes from being equal, and the appropriate ac_lock is
188 * held across the call, so we're single threaded through this code for each
189 * file.
190 */
191static int
192exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
193{
194	int error;
195
196	if (info == NULL)
197		return (0);
198
199	mutex_enter(&info->ac_lock);
200
201	/*
202	 * Don't do anything unless accounting file is set.
203	 */
204	if (info->ac_vnode == NULL) {
205		mutex_exit(&info->ac_lock);
206		return (0);
207	}
208	error = exacct_vn_write_impl(info, buf, bufsize);
209	mutex_exit(&info->ac_lock);
210
211	return (error);
212}
213
214/*
215 * void *exacct_create_header(size_t *)
216 *
217 * Overview
218 *   exacct_create_header() constructs an exacct file header identifying the
219 *   accounting file as the output of the kernel.  exacct_create_header() and
220 *   the static write_header() and verify_header() routines in libexacct must
221 *   remain synchronized.
222 *
223 * Return values
224 *   A pointer to a packed exacct buffer containing the appropriate header is
225 *   returned; the size of the buffer is placed in the location indicated by
226 *   sizep.
227 *
228 * Caller's context
229 *   Suitable for KM_SLEEP allocations.
230 */
231void *
232exacct_create_header(size_t *sizep)
233{
234	ea_object_t *hdr_grp;
235	uint32_t bskip;
236	void *buf;
237	size_t bufsize;
238
239	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
240	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
241	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
242	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
243	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
244	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
245	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
246	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
247	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
248
249	bufsize = ea_pack_object(hdr_grp, NULL, 0);
250	buf = kmem_alloc(bufsize, KM_SLEEP);
251	(void) ea_pack_object(hdr_grp, buf, bufsize);
252	ea_free_object(hdr_grp, EUP_ALLOC);
253
254	/*
255	 * To prevent reading the header when reading the file backwards,
256	 * set the large backskip of the header group to 0 (last 4 bytes).
257	 */
258	bskip = 0;
259	exacct_order32(&bskip);
260	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
261	    sizeof (bskip));
262
263	*sizep = bufsize;
264	return (buf);
265}
266
267/*
268 * int exacct_write_header(ac_info_t *, void *, size_t)
269 *
270 * Overview
271 *   exacct_write_header() writes the given header buffer to the indicated
272 *   vnode.
273 *
274 * Return values
275 *   The result of the write operation is returned.
276 *
277 * Caller's context
278 *   Caller must hold the ac_lock of the appropriate accounting file
279 *   information block (ac_info_t).
280 */
281int
282exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
283{
284	if (info != NULL && info->ac_vnode != NULL)
285		return (exacct_vn_write_impl(info, hdr, hdrsize));
286
287	return (0);
288}
289
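/*
 * A consumer that installs a new accounting file (acctctl(2) is the expected
 * caller) would typically pair the two routines above, along the lines of
 * this sketch:
 *
 *	hdr = exacct_create_header(&hdrsize);
 *	mutex_enter(&info->ac_lock);
 *	error = exacct_write_header(info, hdr, hdrsize);
 *	mutex_exit(&info->ac_lock);
 *	kmem_free(hdr, hdrsize);
 *
 * The actual file installation logic lives in the acctctl code, not here.
 */
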
290static void
291exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
292    task_usage_t **tu_buf)
293{
294	task_usage_t *oldtu, *newtu;
295	task_usage_t **prevusage;
296
297	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
298	if (getzoneid() != GLOBAL_ZONEID) {
299		prevusage = &tk->tk_zoneusage;
300	} else {
301		prevusage = &tk->tk_prevusage;
302	}
303	if ((oldtu = *prevusage) != NULL) {
304		/*
		 * We have accounting information saved from the previous
		 * interval record; report the delta from that snapshot.
307		 */
308		newtu = *tu_buf;
309		bcopy(tu, newtu, sizeof (task_usage_t));
310		tu->tu_minflt	-= oldtu->tu_minflt;
311		tu->tu_majflt	-= oldtu->tu_majflt;
312		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
313		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
314		tu->tu_ioch	-= oldtu->tu_ioch;
315		tu->tu_iblk	-= oldtu->tu_iblk;
316		tu->tu_oblk	-= oldtu->tu_oblk;
317		tu->tu_vcsw	-= oldtu->tu_vcsw;
318		tu->tu_icsw	-= oldtu->tu_icsw;
319		tu->tu_nsig	-= oldtu->tu_nsig;
320		tu->tu_nswp	-= oldtu->tu_nswp;
321		tu->tu_nscl	-= oldtu->tu_nscl;
322		tu->tu_utime	-= oldtu->tu_utime;
323		tu->tu_stime	-= oldtu->tu_stime;
324
325		tu->tu_startsec = oldtu->tu_finishsec;
326		tu->tu_startnsec = oldtu->tu_finishnsec;
327		/*
328		 * Copy the data from our temporary storage to the task's
329		 * previous interval usage structure for future reference.
330		 */
331		bcopy(newtu, oldtu, sizeof (task_usage_t));
332	} else {
333		/*
334		 * Store current statistics in the task's previous interval
		 * usage structure for future reference.
336		 */
337		*prevusage = *tu_buf;
338		bcopy(tu, *prevusage, sizeof (task_usage_t));
339		*tu_buf = NULL;
340	}
341}
342
343static void
344exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
345{
346	timestruc_t ts;
347	proc_t *p;
348
349	ASSERT(MUTEX_HELD(&pidlock));
350
351	if ((p = tk->tk_memb_list) == NULL)
352		return;
353
354	/*
355	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
	 * usage of the potentially many members of the task.  Since we don't
	 * guarantee exactness, p_lock is held only around the microstate
	 * aggregation; the p_ru counters are read without the lock.
359	 */
360	do {
361		mutex_enter(&p->p_lock);
362		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
363		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
364		mutex_exit(&p->p_lock);
365		tu->tu_minflt	+= p->p_ru.minflt;
366		tu->tu_majflt	+= p->p_ru.majflt;
367		tu->tu_sndmsg	+= p->p_ru.msgsnd;
368		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
369		tu->tu_ioch	+= p->p_ru.ioch;
370		tu->tu_iblk	+= p->p_ru.inblock;
371		tu->tu_oblk	+= p->p_ru.oublock;
372		tu->tu_vcsw	+= p->p_ru.nvcsw;
373		tu->tu_icsw	+= p->p_ru.nivcsw;
374		tu->tu_nsig	+= p->p_ru.nsignals;
375		tu->tu_nswp	+= p->p_ru.nswap;
376		tu->tu_nscl	+= p->p_ru.sysc;
377	} while ((p = p->p_tasknext) != tk->tk_memb_list);
378
379	/*
380	 * The resource usage accounted for so far will include that
381	 * contributed by the task's first process. If this process
382	 * came from another task, then its accumulated resource usage
383	 * will include a contribution from work performed there.
384	 * We must therefore subtract any resource usage that was
385	 * inherited with the first process.
386	 */
387	exacct_sub_task_mstate(tu, tk->tk_inherited);
388
389	gethrestime(&ts);
390	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
391	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
392}
393
394/*
395 * void exacct_update_task_mstate(proc_t *)
396 *
397 * Overview
398 *   exacct_update_task_mstate() updates the task usage; it is intended
399 *   to be called from proc_exit().
400 *
401 * Return values
402 *   None.
403 *
404 * Caller's context
405 *   p_lock must be held at entry.
406 */
407void
408exacct_update_task_mstate(proc_t *p)
409{
410	task_usage_t *tu;
411
412	mutex_enter(&p->p_task->tk_usage_lock);
413	tu = p->p_task->tk_usage;
414	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
415	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
416	tu->tu_minflt	+= p->p_ru.minflt;
417	tu->tu_majflt	+= p->p_ru.majflt;
418	tu->tu_sndmsg	+= p->p_ru.msgsnd;
419	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
420	tu->tu_ioch	+= p->p_ru.ioch;
421	tu->tu_iblk	+= p->p_ru.inblock;
422	tu->tu_oblk	+= p->p_ru.oublock;
423	tu->tu_vcsw	+= p->p_ru.nvcsw;
424	tu->tu_icsw	+= p->p_ru.nivcsw;
425	tu->tu_nsig	+= p->p_ru.nsignals;
426	tu->tu_nswp	+= p->p_ru.nswap;
427	tu->tu_nscl	+= p->p_ru.sysc;
428	mutex_exit(&p->p_task->tk_usage_lock);
429}
430
431static void
432exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
433{
434	timestruc_t ts;
435	task_usage_t *tu_buf;
436
437	switch (flag) {
438	case EW_PARTIAL:
439		/*
440		 * For partial records we must report the sum of current
441		 * accounting statistics with previously accumulated
442		 * statistics.
443		 */
444		mutex_enter(&pidlock);
445		mutex_enter(&tk->tk_usage_lock);
446
447		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
448		exacct_snapshot_task_usage(tk, tu);
449
450		mutex_exit(&tk->tk_usage_lock);
451		mutex_exit(&pidlock);
452		break;
453	case EW_INTERVAL:
454		/*
		 * We need to allocate a spare task_usage_t buffer before
456		 * grabbing pidlock because we might need it later in
457		 * exacct_get_interval_task_usage().
458		 */
459		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
460		mutex_enter(&pidlock);
461		mutex_enter(&tk->tk_usage_lock);
462
463		/*
		 * For interval records, we deduct the previously saved
		 * microstate accounting and CPU usage data from the current
		 * totals, and then update the saved task usage structure.
467		 */
468		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
469		exacct_snapshot_task_usage(tk, tu);
470		exacct_get_interval_task_usage(tk, tu, &tu_buf);
471
472		mutex_exit(&tk->tk_usage_lock);
473		mutex_exit(&pidlock);
474
475		if (tu_buf != NULL)
476			kmem_free(tu_buf, sizeof (task_usage_t));
477		break;
478	case EW_FINAL:
479		/*
480		 * For final records, we deduct, from the task's current
481		 * usage, any usage that was inherited with the arrival
482		 * of a process from a previous task. We then record
483		 * the task's finish time.
484		 */
485		mutex_enter(&tk->tk_usage_lock);
486		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
487		exacct_sub_task_mstate(tu, tk->tk_inherited);
488		mutex_exit(&tk->tk_usage_lock);
489
490		gethrestime(&ts);
491		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
492		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
493
494		break;
495	}
496}
497
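/*
 * To summarize the three cases above: EW_PARTIAL reports the task's usage to
 * date (the accumulated usage of exited members plus a snapshot of the current
 * members, less anything inherited), EW_INTERVAL reports only what has accrued
 * since the previous interval record, and EW_FINAL reports the task's total
 * usage less anything inherited with processes that arrived from other tasks.
 */
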
498static int
499exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
500    int res)
501{
502	int attached = 1;
503
504	switch (res) {
505	case AC_TASK_TASKID:
506		(void) ea_attach_item(record, &tk->tk_tkid,
507		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
508		break;
509	case AC_TASK_PROJID:
510		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
511		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
512		break;
513	case AC_TASK_CPU: {
514			timestruc_t ts;
515			uint64_t ui;
516
517			hrt2ts(tu->tu_stime, &ts);
518			ui = ts.tv_sec;
519			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
520			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
521			ui = ts.tv_nsec;
522			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
523			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);
524
525			hrt2ts(tu->tu_utime, &ts);
526			ui = ts.tv_sec;
527			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
528			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
529			ui = ts.tv_nsec;
530			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
531			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
532		}
533		break;
534	case AC_TASK_TIME:
535		(void) ea_attach_item(record, &tu->tu_startsec,
536		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
537		(void) ea_attach_item(record, &tu->tu_startnsec,
538		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
539		(void) ea_attach_item(record, &tu->tu_finishsec,
540		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
541		(void) ea_attach_item(record, &tu->tu_finishnsec,
542		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
543		break;
544	case AC_TASK_HOSTNAME:
545		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
546		    strlen(tk->tk_zone->zone_nodename) + 1,
547		    EXT_STRING | EXD_TASK_HOSTNAME);
		break;
549	case AC_TASK_MICROSTATE:
550		(void) ea_attach_item(record, &tu->tu_majflt,
551		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
552		(void) ea_attach_item(record, &tu->tu_minflt,
553		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
554		(void) ea_attach_item(record, &tu->tu_sndmsg,
555		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
556		(void) ea_attach_item(record, &tu->tu_rcvmsg,
557		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
558		(void) ea_attach_item(record, &tu->tu_iblk,
559		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
560		(void) ea_attach_item(record, &tu->tu_oblk,
561		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
562		(void) ea_attach_item(record, &tu->tu_ioch,
563		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
564		(void) ea_attach_item(record, &tu->tu_vcsw,
565		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
566		(void) ea_attach_item(record, &tu->tu_icsw,
567		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
568		(void) ea_attach_item(record, &tu->tu_nsig,
569		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
570		(void) ea_attach_item(record, &tu->tu_nswp,
571		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
572		(void) ea_attach_item(record, &tu->tu_nscl,
573		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
574		break;
575	case AC_TASK_ANCTASKID:
576		(void) ea_attach_item(record, &tu->tu_anctaskid,
577		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
578		break;
579	case AC_TASK_ZONENAME:
580		(void) ea_attach_item(record, tk->tk_zone->zone_name,
581		    strlen(tk->tk_zone->zone_name) + 1,
582		    EXT_STRING | EXD_TASK_ZONENAME);
583		break;
584	default:
585		attached = 0;
586	}
587	return (attached);
588}
589
590static ea_object_t *
591exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
592    ea_catalog_t record_type)
593{
594	int res, count;
595	ea_object_t *record;
596
597	/*
598	 * Assemble usage values into group.
599	 */
600	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
601	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
602		if (BT_TEST(mask, res))
603			count += exacct_attach_task_item(tk, tu, record, res);
604	if (count == 0) {
605		ea_free_object(record, EUP_ALLOC);
606		record = NULL;
607	}
608	return (record);
609}
610
611/*
 * int exacct_assemble_task_usage(ac_info_t *, task_t *,
 *	int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
 *	void *, size_t, size_t *, int)
614 *
615 * Overview
616 *   exacct_assemble_task_usage() builds the packed exacct buffer for the
 *   indicated task, executes the given callback function, and frees the
 *   packed buffer.
619 *
620 * Return values
621 *   Returns 0 on success; otherwise the appropriate error code is returned.
622 *
623 * Caller's context
624 *   Suitable for KM_SLEEP allocations.
625 */
626int
627exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
628    int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
629    void *ubuf, size_t ubufsize, size_t *actual, int flag)
630{
631	ulong_t mask[AC_MASK_SZ];
632	ea_object_t *task_record;
633	ea_catalog_t record_type;
634	task_usage_t *tu;
635	void *buf;
636	size_t bufsize;
637	int ret;
638
639	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
640
641	mutex_enter(&ac_task->ac_lock);
642	if (ac_task->ac_state == AC_OFF) {
643		mutex_exit(&ac_task->ac_lock);
644		return (ENOTACTIVE);
645	}
646	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
647	mutex_exit(&ac_task->ac_lock);
648
649	switch (flag) {
650	case EW_FINAL:
651		record_type = EXD_GROUP_TASK;
652		break;
653	case EW_PARTIAL:
654		record_type = EXD_GROUP_TASK_PARTIAL;
655		break;
656	case EW_INTERVAL:
657		record_type = EXD_GROUP_TASK_INTERVAL;
658		break;
659	default:
660		return (0);
661	}
662
663	/*
664	 * Calculate task usage and assemble it into the task record.
665	 */
666	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
667	exacct_calculate_task_usage(tk, tu, flag);
668	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
669	if (task_record == NULL) {
670		/*
671		 * The current configuration of the accounting system has
672		 * resulted in records with no data; accordingly, we don't write
673		 * these, but we return success.
674		 */
675		kmem_free(tu, sizeof (task_usage_t));
676		return (0);
677	}
678
679	/*
680	 * Pack object into buffer and run callback on it.
681	 */
682	bufsize = ea_pack_object(task_record, NULL, 0);
683	buf = kmem_alloc(bufsize, KM_SLEEP);
684	(void) ea_pack_object(task_record, buf, bufsize);
685	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
686
687	/*
688	 * Free all previously allocated structures.
689	 */
690	kmem_free(buf, bufsize);
691	ea_free_object(task_record, EUP_ALLOC);
692	kmem_free(tu, sizeof (task_usage_t));
693	return (ret);
694}
695
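/*
 * exacct_commit_callback(), below, writes the packed record produced above to
 * the accounting file.  A getacct(2)-style consumer would instead supply a
 * callback that copies the packed record out to the caller's buffer, roughly
 * like this sketch (the real getacct(2) callback lives with the system call
 * code, not in this file):
 *
 *	static int
 *	getacct_callback(ac_info_t *unused, void *ubuf, size_t ubufsize,
 *	    void *buf, size_t bufsize, size_t *actual)
 *	{
 *		size_t size = MIN(bufsize, ubufsize);
 *
 *		if (ubuf != NULL && copyout(buf, ubuf, size) != 0)
 *			return (EFAULT);
 *		*actual = bufsize;
 *		return (0);
 *	}
 */
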
696/*
697 * void exacct_commit_task(void *)
698 *
699 * Overview
 *   exacct_commit_task() calculates the final usage for a task and, if task
 *   accounting is active, writes a final task record (for a task in a
 *   non-global zone, a record is written to both that zone's accounting file
 *   and the global zone's).  exacct_commit_task() is intended to be called
 *   from a task queue (taskq_t).
704 *
705 * Return values
706 *   None.
707 *
708 * Caller's context
709 *   Suitable for KM_SLEEP allocations.
710 */
711
712void
713exacct_commit_task(void *arg)
714{
715	task_t *tk = (task_t *)arg;
716	size_t size;
717	zone_t *zone = tk->tk_zone;
718	struct exacct_globals *acg;
719
720	ASSERT(tk != task0p);
721	ASSERT(tk->tk_memb_list == NULL);
722
723	/*
724	 * Don't do any extra work if the acctctl module isn't loaded.
	 * If the acctctl module is loaded while the zone is shutting down,
	 * zone_getspecific() can return NULL for that zone.
727	 */
728	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
729		acg = zone_getspecific(exacct_zone_key, zone);
730		if (acg == NULL)
731			goto err;
732		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
733		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
734		if (tk->tk_zone != global_zone) {
735			acg = zone_getspecific(exacct_zone_key, global_zone);
736			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
737			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
738		}
739	}
740	/*
741	 * Release associated project and finalize task.
742	 */
743err:
744	task_end(tk);
745}
746
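/*
 * As its Overview notes, exacct_commit_task() is meant to run from a task
 * queue.  One natural dispatch, sketched here for illustration only, would
 * use exacct_queue (assigned in exacct_init()):
 *
 *	(void) taskq_dispatch(exacct_queue, exacct_commit_task, tk, TQ_SLEEP);
 *
 * The actual dispatch site is in the task management code, not in this file.
 */
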
747static int
748exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
749{
750	int attached = 1;
751
752	switch (res) {
753	case AC_PROC_PID:
754		(void) ea_attach_item(record, &pu->pu_pid,
755		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
756		break;
757	case AC_PROC_UID:
758		(void) ea_attach_item(record, &pu->pu_ruid,
759		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
760		break;
761	case AC_PROC_FLAG:
762		(void) ea_attach_item(record, &pu->pu_acflag,
763		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
764		break;
765	case AC_PROC_GID:
766		(void) ea_attach_item(record, &pu->pu_rgid,
767		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
768		break;
769	case AC_PROC_PROJID:
770		(void) ea_attach_item(record, &pu->pu_projid,
771		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
772		break;
773	case AC_PROC_TASKID:
774		(void) ea_attach_item(record, &pu->pu_taskid,
775		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
776		break;
777	case AC_PROC_CPU:
778		(void) ea_attach_item(record, &pu->pu_utimesec,
779		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
780		(void) ea_attach_item(record, &pu->pu_utimensec,
781		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
782		(void) ea_attach_item(record, &pu->pu_stimesec,
783		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
784		(void) ea_attach_item(record, &pu->pu_stimensec,
785		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
786		break;
787	case AC_PROC_TIME:
788		(void) ea_attach_item(record, &pu->pu_startsec,
789		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
790		(void) ea_attach_item(record, &pu->pu_startnsec,
791		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
792		(void) ea_attach_item(record, &pu->pu_finishsec,
793		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
794		(void) ea_attach_item(record, &pu->pu_finishnsec,
795		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
796		break;
797	case AC_PROC_COMMAND:
798		(void) ea_attach_item(record, pu->pu_command,
799		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
800		break;
801	case AC_PROC_HOSTNAME:
802		(void) ea_attach_item(record, pu->pu_nodename,
803		    strlen(pu->pu_nodename) + 1,
804		    EXT_STRING | EXD_PROC_HOSTNAME);
805		break;
806	case AC_PROC_TTY:
807		(void) ea_attach_item(record, &pu->pu_major,
808		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
809		(void) ea_attach_item(record, &pu->pu_minor,
810		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
811		break;
812	case AC_PROC_MICROSTATE:
813		(void) ea_attach_item(record, &pu->pu_majflt,
814		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
815		(void) ea_attach_item(record, &pu->pu_minflt,
816		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
817		(void) ea_attach_item(record, &pu->pu_sndmsg,
818		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
819		(void) ea_attach_item(record, &pu->pu_rcvmsg,
820		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
821		(void) ea_attach_item(record, &pu->pu_iblk,
822		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
823		(void) ea_attach_item(record, &pu->pu_oblk,
824		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
825		(void) ea_attach_item(record, &pu->pu_ioch,
826		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
827		(void) ea_attach_item(record, &pu->pu_vcsw,
828		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
829		(void) ea_attach_item(record, &pu->pu_icsw,
830		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
831		(void) ea_attach_item(record, &pu->pu_nsig,
832		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
833		(void) ea_attach_item(record, &pu->pu_nswp,
834		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
835		(void) ea_attach_item(record, &pu->pu_nscl,
836		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
837		break;
838	case AC_PROC_ANCPID:
839		(void) ea_attach_item(record, &pu->pu_ancpid,
840		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
841		break;
842	case AC_PROC_WAIT_STATUS:
843		(void) ea_attach_item(record, &pu->pu_wstat,
844		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
845		break;
846	case AC_PROC_ZONENAME:
847		(void) ea_attach_item(record, pu->pu_zonename,
848		    strlen(pu->pu_zonename) + 1,
849		    EXT_STRING | EXD_PROC_ZONENAME);
850		break;
851	case AC_PROC_MEM:
852		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
853		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
854		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
855		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
856		break;
857	default:
858		attached = 0;
859	}
860	return (attached);
861}
862
863static ea_object_t *
864exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
865    ea_catalog_t record_type)
866{
867	int res, count;
868	ea_object_t *record;
869
870	/*
871	 * Assemble usage values into group.
872	 */
873	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
874	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
875		if (BT_TEST(mask, res))
876			count += exacct_attach_proc_item(pu, record, res);
877	if (count == 0) {
878		ea_free_object(record, EUP_ALLOC);
879		record = NULL;
880	}
881	return (record);
882}
883
884/*
 * The following two routines assume that the process's p_lock is held, or
 * that exacct_commit_proc() was called from proc_exit() with all LWPs stopped.
887 */
888static void
889exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
890{
891	kthread_t *t;
892
893	ASSERT(MUTEX_HELD(&p->p_lock));
894	if ((t = p->p_tlist) == NULL)
895		return;
896
897	do {
898		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
899		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
900		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
901		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
902		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
903		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
904		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
905		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
906		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
907		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
908		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
909		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
910	} while ((t = t->t_forw) != p->p_tlist);
911}
912
913static void
914exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
915{
916	pu->pu_minflt	= p->p_ru.minflt;
917	pu->pu_majflt	= p->p_ru.majflt;
918	pu->pu_sndmsg	= p->p_ru.msgsnd;
919	pu->pu_rcvmsg	= p->p_ru.msgrcv;
920	pu->pu_ioch	= p->p_ru.ioch;
921	pu->pu_iblk	= p->p_ru.inblock;
922	pu->pu_oblk	= p->p_ru.oublock;
923	pu->pu_vcsw	= p->p_ru.nvcsw;
924	pu->pu_icsw	= p->p_ru.nivcsw;
925	pu->pu_nsig	= p->p_ru.nsignals;
926	pu->pu_nswp	= p->p_ru.nswap;
927	pu->pu_nscl	= p->p_ru.sysc;
928}
929
930void
931exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
932    int flag, int wstat)
933{
934	timestruc_t ts, ts_run;
935
936	ASSERT(MUTEX_HELD(&p->p_lock));
937
938	/*
939	 * Convert CPU and execution times to sec/nsec format.
940	 */
941	if (BT_TEST(mask, AC_PROC_CPU)) {
942		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
943		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
944		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
945		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
946		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
947		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
948	}
949	if (BT_TEST(mask, AC_PROC_TIME)) {
950		gethrestime(&ts);
951		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
952		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
953		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
954		ts.tv_sec -= ts_run.tv_sec;
955		ts.tv_nsec -= ts_run.tv_nsec;
956		if (ts.tv_nsec < 0) {
957			ts.tv_sec--;
958			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
959				ts.tv_sec++;
960				ts.tv_nsec -= NANOSEC;
961			}
962		}
963		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
964		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
965	}
966
967	pu->pu_pid = p->p_pidp->pid_id;
968	pu->pu_acflag = p->p_user.u_acflag;
969	pu->pu_projid = p->p_task->tk_proj->kpj_id;
970	pu->pu_taskid = p->p_task->tk_tkid;
971	pu->pu_major = getmajor(p->p_sessp->s_dev);
972	pu->pu_minor = getminor(p->p_sessp->s_dev);
973	pu->pu_ancpid = p->p_ancpid;
974	pu->pu_wstat = wstat;
975	/*
	 * Compute average RSS in K.  The denominator is the number of
	 * samples: clock ticks (p_utime + p_stime), plus one initial value.
978	 */
979	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
980	    (PAGESIZE / 1024);
981	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
982
983	mutex_enter(&p->p_crlock);
984	pu->pu_ruid = crgetruid(p->p_cred);
985	pu->pu_rgid = crgetrgid(p->p_cred);
986	mutex_exit(&p->p_crlock);
987
988	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
989	bcopy(p->p_zone->zone_name, pu->pu_zonename,
990	    strlen(p->p_zone->zone_name) + 1);
991	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
992	    strlen(p->p_zone->zone_nodename) + 1);
993
994	/*
	 * Collect the microstate accounting data.  For a process that is still
	 * running (EW_PARTIAL) we walk its LWPs and sum their usage here; for a
	 * final record (EW_FINAL) we copy the already-aggregated p_ru totals.
998	 */
999	if (flag & EW_PARTIAL)
1000		exacct_calculate_proc_mstate(p, pu);
1001	if (flag & EW_FINAL)
1002		exacct_copy_proc_mstate(p, pu);
1003}
1004
1005/*
 * int exacct_assemble_proc_usage(ac_info_t *, proc_usage_t *,
 *	int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
 *	void *, size_t, size_t *, int)
1008 *
1009 * Overview
 *   Assembles a record with miscellaneous accounting information about the
 *   process and executes the callback on it.  It is the callback's job to set
 *   "actual" to the size of the record.
1013 *
1014 * Return values
1015 *   The result of the callback function, unless the extended process accounting
1016 *   feature is not active, in which case ENOTACTIVE is returned.
1017 *
1018 * Caller's context
1019 *   Suitable for KM_SLEEP allocations.
1020 */
1021int
1022exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
1023    int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1024    void *ubuf, size_t ubufsize, size_t *actual, int flag)
1025{
1026	ulong_t mask[AC_MASK_SZ];
1027	ea_object_t *proc_record;
1028	ea_catalog_t record_type;
1029	void *buf;
1030	size_t bufsize;
1031	int ret;
1032
1033	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
1034
1035	mutex_enter(&ac_proc->ac_lock);
1036	if (ac_proc->ac_state == AC_OFF) {
1037		mutex_exit(&ac_proc->ac_lock);
1038		return (ENOTACTIVE);
1039	}
1040	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1041	mutex_exit(&ac_proc->ac_lock);
1042
1043	switch (flag) {
1044	case EW_FINAL:
1045		record_type = EXD_GROUP_PROC;
1046		break;
1047	case EW_PARTIAL:
1048		record_type = EXD_GROUP_PROC_PARTIAL;
1049		break;
1050	default:
1051		record_type = EXD_NONE;
1052		break;
1053	}
1054
1055	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
1056	if (proc_record == NULL)
1057		return (0);
1058
1059	/*
1060	 * Pack object into buffer and pass to callback.
1061	 */
1062	bufsize = ea_pack_object(proc_record, NULL, 0);
1063	buf = kmem_alloc(bufsize, KM_SLEEP);
1064	(void) ea_pack_object(proc_record, buf, bufsize);
1065
1066	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
1067
1068	/*
	 * Free all previous allocations.
1070	 */
1071	kmem_free(buf, bufsize);
1072	ea_free_object(proc_record, EUP_ALLOC);
1073	return (ret);
1074}
1075
1076/*
1077 * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
1078 * 	size_t *)
1079 *
1080 * Overview
1081 *   exacct_commit_callback() writes the indicated buffer to the indicated
1082 *   extended accounting file.
1083 *
1084 * Return values
1085 *   The result of the write operation is returned.  "actual" is updated to
1086 *   contain the number of bytes actually written.
1087 *
1088 * Caller's context
1089 *   Suitable for a vn_rdwr() operation.
1090 */
1091/*ARGSUSED*/
1092int
1093exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
1094    void *buf, size_t bufsize, size_t *actual)
1095{
1096	int error = 0;
1097
1098	*actual = 0;
1099	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
1100		*actual = bufsize;
1101	return (error);
1102}
1103
1104static void
1105exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
1106{
1107	size_t size;
1108	proc_usage_t *pu;
1109	ulong_t mask[AC_MASK_SZ];
1110
1111	mutex_enter(&ac_proc->ac_lock);
1112	if (ac_proc->ac_state == AC_ON) {
1113		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1114		mutex_exit(&ac_proc->ac_lock);
1115	} else {
1116		mutex_exit(&ac_proc->ac_lock);
1117		return;
1118	}
1119
1120	mutex_enter(&p->p_lock);
1121	size = strlen(p->p_user.u_comm) + 1;
1122	mutex_exit(&p->p_lock);
1123
1124	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
1125	pu->pu_command = kmem_alloc(size, KM_SLEEP);
1126	mutex_enter(&p->p_lock);
1127	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
1128	mutex_exit(&p->p_lock);
1129
1130	(void) exacct_assemble_proc_usage(ac_proc, pu,
1131	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
1132
1133	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
1134	kmem_free(pu, sizeof (proc_usage_t));
1135}
1136
1137/*
1138 * void exacct_commit_proc(proc_t *, int)
1139 *
1140 * Overview
1141 *   exacct_commit_proc() calculates the final usage for a process, updating the
1142 *   task usage if task accounting is active, and writing a process record if
 *   process accounting is active.  exacct_commit_proc() is intended to be
 *   called from proc_exit().
1145 *
1146 * Return values
1147 *   None.
1148 *
1149 * Caller's context
1150 *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
1151 */
1152void
1153exacct_commit_proc(proc_t *p, int wstat)
1154{
1155	zone_t *zone = p->p_zone;
1156	struct exacct_globals *acg, *gacg = NULL;
1157
1158	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1159		/*
1160		 * acctctl module not loaded.  Nothing to do.
1161		 */
1162		return;
1163	}
1164
1165	/*
	 * If the acctctl module is loaded while the zone is shutting down,
	 * zone_getspecific() can return NULL for that zone.
1168	 */
1169	acg = zone_getspecific(exacct_zone_key, zone);
1170	if (acg == NULL)
1171		return;
1172	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
1173	if (zone != global_zone) {
1174		gacg = zone_getspecific(exacct_zone_key, global_zone);
1175		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
1176	}
1177}
1178
1179static int
1180exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res)
1181{
1182	int		attached = 1;
1183
1184	switch (res) {
1185	case AC_NET_NAME:
1186		(void) ea_attach_item(record, ns->ns_name,
1187		    strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME);
1188		break;
1189	case AC_NET_CURTIME:
1190		{
1191			uint64_t	now;
1192			timestruc_t	ts;
1193
1194			gethrestime(&ts);
1195			now = (uint64_t)(ulong_t)ts.tv_sec;
1196			(void) ea_attach_item(record,  &now, sizeof (uint64_t),
1197			    EXT_UINT64 | EXD_NET_STATS_CURTIME);
1198		}
1199		break;
1200	case AC_NET_IBYTES:
1201		(void) ea_attach_item(record, &ns->ns_ibytes,
1202		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES);
1203		break;
1204	case AC_NET_OBYTES:
1205		(void) ea_attach_item(record, &ns->ns_obytes,
1206		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES);
1207		break;
1208	case AC_NET_IPKTS:
1209		(void) ea_attach_item(record, &ns->ns_ipackets,
1210		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS);
1211		break;
1212	case AC_NET_OPKTS:
1213		(void) ea_attach_item(record, &ns->ns_opackets,
1214		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS);
1215		break;
1216	case AC_NET_IERRPKTS:
1217		(void) ea_attach_item(record, &ns->ns_ierrors,
1218		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS);
1219		break;
1220	case AC_NET_OERRPKTS:
1221		(void) ea_attach_item(record, &ns->ns_oerrors,
1222		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS);
1223		break;
1224	default:
1225		attached = 0;
1226	}
1227	return (attached);
1228}
1229
1230static int
1231exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res)
1232{
1233	int attached = 1;
1234
1235	switch (res) {
1236	case AC_NET_NAME:
1237		(void) ea_attach_item(record, nd->nd_name,
1238		    strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME);
1239		break;
1240	case AC_NET_DEVNAME:
1241		(void) ea_attach_item(record, nd->nd_devname,
1242		    strlen(nd->nd_devname) + 1, EXT_STRING |
1243		    EXD_NET_DESC_DEVNAME);
1244		break;
1245	case AC_NET_EHOST:
1246		(void) ea_attach_item(record, &nd->nd_ehost,
1247		    sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST);
1248		break;
1249	case AC_NET_EDEST:
1250		(void) ea_attach_item(record, &nd->nd_edest,
1251		    sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST);
1252		break;
1253	case AC_NET_VLAN_TPID:
1254		(void) ea_attach_item(record, &nd->nd_vlan_tpid,
1255		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID);
1256		break;
1257	case AC_NET_VLAN_TCI:
1258		(void) ea_attach_item(record, &nd->nd_vlan_tci,
1259		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI);
1260		break;
1261	case AC_NET_SAP:
1262		(void) ea_attach_item(record, &nd->nd_sap,
1263		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP);
1264		break;
1265	case AC_NET_PRIORITY:
1266		(void) ea_attach_item(record, &nd->nd_priority,
1267		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY);
1268		break;
1269	case AC_NET_BWLIMIT:
1270		(void) ea_attach_item(record, &nd->nd_bw_limit,
1271		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT);
1272		break;
1273	case AC_NET_SADDR:
1274		if (nd->nd_isv4) {
1275			(void) ea_attach_item(record, &nd->nd_saddr[3],
1276			    sizeof (uint32_t), EXT_UINT32 |
1277			    EXD_NET_DESC_V4SADDR);
1278		} else {
1279			(void) ea_attach_item(record, &nd->nd_saddr,
1280			    sizeof (nd->nd_saddr), EXT_RAW |
1281			    EXD_NET_DESC_V6SADDR);
1282		}
1283		break;
1284	case AC_NET_DADDR:
1285		if (nd->nd_isv4) {
1286			(void) ea_attach_item(record, &nd->nd_daddr[3],
1287			    sizeof (uint32_t), EXT_UINT32 |
1288			    EXD_NET_DESC_V4DADDR);
1289		} else {
1290			(void) ea_attach_item(record, &nd->nd_daddr,
1291			    sizeof (nd->nd_daddr), EXT_RAW |
1292			    EXD_NET_DESC_V6DADDR);
1293		}
1294		break;
1295	case AC_NET_SPORT:
1296		(void) ea_attach_item(record, &nd->nd_sport,
1297		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT);
1298		break;
1299	case AC_NET_DPORT:
1300		(void) ea_attach_item(record, &nd->nd_dport,
1301		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT);
1302		break;
1303	case AC_NET_PROTOCOL:
1304		(void) ea_attach_item(record, &nd->nd_protocol,
1305		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL);
1306		break;
1307	case AC_NET_DSFIELD:
1308		(void) ea_attach_item(record, &nd->nd_dsfield,
1309		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD);
1310		break;
1311	default:
1312		attached = 0;
1313	}
1314	return (attached);
1315}
1316
1317static ea_object_t *
1318exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type,
1319    int what)
1320{
1321	int		res;
1322	int		count;
1323	ea_object_t	*record;
1324
1325	/*
1326	 * Assemble usage values into group.
1327	 */
1328	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1329	for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++)
1330		if (BT_TEST(mask, res)) {
1331			if (what == EX_NET_LNDESC_REC ||
1332			    what == EX_NET_FLDESC_REC) {
1333				count += exacct_attach_netdesc_item(
1334				    (net_desc_t *)ninfo, record, res);
1335			} else {
1336				count += exacct_attach_netstat_item(
1337				    (net_stat_t *)ninfo, record, res);
1338			}
1339		}
1340	if (count == 0) {
1341		ea_free_object(record, EUP_ALLOC);
1342		record = NULL;
1343	}
1344	return (record);
1345}
1346
1347int
1348exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo,
1349    int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1350    void *ubuf, size_t ubufsize, size_t *actual, int what)
1351{
1352	ulong_t		mask[AC_MASK_SZ];
1353	ea_object_t	*net_desc;
1354	ea_catalog_t	record_type;
1355	void		*buf;
1356	size_t		bufsize;
1357	int		ret;
1358
1359	mutex_enter(&ac_net->ac_lock);
1360	if (ac_net->ac_state == AC_OFF) {
1361		mutex_exit(&ac_net->ac_lock);
1362		return (ENOTACTIVE);
1363	}
1364	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1365	mutex_exit(&ac_net->ac_lock);
1366
1367	switch (what) {
1368	case EX_NET_LNDESC_REC:
1369		record_type = EXD_GROUP_NET_LINK_DESC;
1370		break;
1371	case EX_NET_LNSTAT_REC:
1372		record_type = EXD_GROUP_NET_LINK_STATS;
1373		break;
1374	case EX_NET_FLDESC_REC:
1375		record_type = EXD_GROUP_NET_FLOW_DESC;
1376		break;
1377	case EX_NET_FLSTAT_REC:
1378		record_type = EXD_GROUP_NET_FLOW_STATS;
1379		break;
1380	default:
1381		return (0);
1382	}
1383
1384	net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what);
1385	if (net_desc == NULL)
1386		return (0);
1387
1388	/*
1389	 * Pack object into buffer and pass to callback.
1390	 */
1391	bufsize = ea_pack_object(net_desc, NULL, 0);
1392	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1393	if (buf == NULL)
1394		return (ENOMEM);
1395
1396	(void) ea_pack_object(net_desc, buf, bufsize);
1397
1398	ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual);
1399
1400	/*
	 * Free all previous allocations.
1402	 */
1403	kmem_free(buf, bufsize);
1404	ea_free_object(net_desc, EUP_ALLOC);
1405	return (ret);
1406}
1407
1408int
1409exacct_commit_netinfo(void *arg, int what)
1410{
1411	size_t			size;
1412	ulong_t			mask[AC_MASK_SZ];
1413	struct exacct_globals	*acg;
1414	ac_info_t		*ac_net;
1415
1416	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1417		/*
1418		 * acctctl module not loaded. Nothing to do.
1419		 */
1420		return (ENOTACTIVE);
1421	}
1422
1423	/*
1424	 * Even though each zone nominally has its own flow accounting settings
1425	 * (ac_flow), these are only maintained by and for the global zone.
1426	 *
1427	 * If this were to change in the future, this function should grow a
1428	 * second zoneid (or zone) argument, and use the corresponding zone's
1429	 * settings rather than always using those of the global zone.
1430	 */
1431	acg = zone_getspecific(exacct_zone_key, global_zone);
1432	ac_net = &acg->ac_net;
1433
1434	mutex_enter(&ac_net->ac_lock);
1435	if (ac_net->ac_state == AC_OFF) {
1436		mutex_exit(&ac_net->ac_lock);
1437		return (ENOTACTIVE);
1438	}
1439	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1440	mutex_exit(&ac_net->ac_lock);
1441
1442	return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback,
1443	    NULL, 0, &size, what));
1444}
1445
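/*
 * exacct_commit_netinfo() is the entry point through which the networking
 * code logs link and flow accounting records.  The caller passes a net_desc_t
 * for the descriptor record types (EX_NET_LNDESC_REC, EX_NET_FLDESC_REC) and
 * a net_stat_t for the statistics record types (EX_NET_LNSTAT_REC,
 * EX_NET_FLSTAT_REC), for example (sketch only):
 *
 *	(void) exacct_commit_netinfo((void *)&nd, EX_NET_LNDESC_REC);
 *	(void) exacct_commit_netinfo((void *)&ns, EX_NET_LNSTAT_REC);
 *
 * The callers live in the networking code, outside this file.
 */
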
1446static int
1447exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
1448{
1449	int attached = 1;
1450
1451	switch (res) {
1452	case AC_FLOW_SADDR:
1453		if (fu->fu_isv4) {
1454			(void) ea_attach_item(record, &fu->fu_saddr[3],
1455			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
1456		} else {
1457			(void) ea_attach_item(record, &fu->fu_saddr,
1458			    sizeof (fu->fu_saddr), EXT_RAW |
1459			    EXD_FLOW_V6SADDR);
1460		}
1461		break;
1462	case AC_FLOW_DADDR:
1463		if (fu->fu_isv4) {
1464			(void) ea_attach_item(record, &fu->fu_daddr[3],
1465			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
1466		} else {
1467			(void) ea_attach_item(record, &fu->fu_daddr,
1468			    sizeof (fu->fu_daddr), EXT_RAW |
1469			    EXD_FLOW_V6DADDR);
1470		}
1471		break;
1472	case AC_FLOW_SPORT:
1473		(void) ea_attach_item(record, &fu->fu_sport,
1474		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
1475		break;
1476	case AC_FLOW_DPORT:
1477		(void) ea_attach_item(record, &fu->fu_dport,
1478		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
1479		break;
1480	case AC_FLOW_PROTOCOL:
1481		(void) ea_attach_item(record, &fu->fu_protocol,
1482		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
1483		break;
1484	case AC_FLOW_DSFIELD:
1485		(void) ea_attach_item(record, &fu->fu_dsfield,
1486		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
1487		break;
1488	case AC_FLOW_CTIME:
1489		(void) ea_attach_item(record, &fu->fu_ctime,
1490		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
1491		break;
1492	case AC_FLOW_LSEEN:
1493		(void) ea_attach_item(record, &fu->fu_lseen,
1494		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
1495		break;
1496	case AC_FLOW_NBYTES:
1497		(void) ea_attach_item(record, &fu->fu_nbytes,
1498		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
1499		break;
1500	case AC_FLOW_NPKTS:
1501		(void) ea_attach_item(record, &fu->fu_npackets,
1502		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
1503		break;
1504	case AC_FLOW_PROJID:
1505		if (fu->fu_projid >= 0) {
1506			(void) ea_attach_item(record, &fu->fu_projid,
1507			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
1508		}
1509		break;
1510	case AC_FLOW_UID:
1511		if (fu->fu_userid >= 0) {
1512			(void) ea_attach_item(record, &fu->fu_userid,
1513			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
1514		}
1515		break;
1516	case AC_FLOW_ANAME:
1517		(void) ea_attach_item(record, fu->fu_aname,
1518		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
1519		break;
1520	default:
1521		attached = 0;
1522	}
1523	return (attached);
1524}
1525
1526static ea_object_t *
1527exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
1528    ea_catalog_t record_type)
1529{
1530	int res, count;
1531	ea_object_t *record;
1532
1533	/*
1534	 * Assemble usage values into group.
1535	 */
1536	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1537	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
1538		if (BT_TEST(mask, res))
1539			count += exacct_attach_flow_item(fu, record, res);
1540	if (count == 0) {
1541		ea_free_object(record, EUP_ALLOC);
1542		record = NULL;
1543	}
1544	return (record);
1545}
1546
1547int
1548exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
1549    int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1550    void *ubuf, size_t ubufsize, size_t *actual)
1551{
1552	ulong_t mask[AC_MASK_SZ];
1553	ea_object_t *flow_usage;
1554	ea_catalog_t record_type;
1555	void *buf;
1556	size_t bufsize;
1557	int ret;
1558
1559	mutex_enter(&ac_flow->ac_lock);
1560	if (ac_flow->ac_state == AC_OFF) {
1561		mutex_exit(&ac_flow->ac_lock);
1562		return (ENOTACTIVE);
1563	}
1564	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1565	mutex_exit(&ac_flow->ac_lock);
1566
1567	record_type = EXD_GROUP_FLOW;
1568
1569	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
1570	if (flow_usage == NULL) {
1571		return (0);
1572	}
1573
1574	/*
1575	 * Pack object into buffer and pass to callback.
1576	 */
1577	bufsize = ea_pack_object(flow_usage, NULL, 0);
1578	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1579	if (buf == NULL) {
1580		return (ENOMEM);
1581	}
1582
1583	(void) ea_pack_object(flow_usage, buf, bufsize);
1584
1585	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
1586
1587	/*
	 * Free all previous allocations.
1589	 */
1590	kmem_free(buf, bufsize);
1591	ea_free_object(flow_usage, EUP_ALLOC);
1592	return (ret);
1593}
1594
1595void
1596exacct_commit_flow(void *arg)
1597{
1598	flow_usage_t *f = (flow_usage_t *)arg;
1599	size_t size;
1600	ulong_t mask[AC_MASK_SZ];
1601	struct exacct_globals *acg;
1602	ac_info_t *ac_flow;
1603
1604	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1605		/*
1606		 * acctctl module not loaded. Nothing to do.
1607		 */
1608		return;
1609	}
1610
1611	/*
1612	 * Even though each zone nominally has its own flow accounting settings
1613	 * (ac_flow), these are only maintained by and for the global zone.
1614	 *
1615	 * If this were to change in the future, this function should grow a
1616	 * second zoneid (or zone) argument, and use the corresponding zone's
1617	 * settings rather than always using those of the global zone.
1618	 */
1619	acg = zone_getspecific(exacct_zone_key, global_zone);
1620	ac_flow = &acg->ac_flow;
1621
1622	mutex_enter(&ac_flow->ac_lock);
1623	if (ac_flow->ac_state == AC_OFF) {
1624		mutex_exit(&ac_flow->ac_lock);
1625		return;
1626	}
1627	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1628	mutex_exit(&ac_flow->ac_lock);
1629
1630	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
1631	    NULL, 0, &size);
1632}
1633
1634/*
 * int exacct_tag_task(ac_info_t *, task_t *, void *, size_t, int)
1636 *
1637 * Overview
1638 *   exacct_tag_task() provides the exacct record construction and writing
1639 *   support required by putacct(2) for task entities.
1640 *
1641 * Return values
1642 *   The result of the write operation is returned, unless the extended
1643 *   accounting facility is not active, in which case ENOTACTIVE is returned.
1644 *
1645 * Caller's context
1646 *   Suitable for KM_SLEEP allocations.
1647 */
1648int
1649exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
1650    int flags)
1651{
1652	int error = 0;
1653	void *buf;
1654	size_t bufsize;
1655	ea_catalog_t cat;
1656	ea_object_t *tag;
1657
1658	mutex_enter(&ac_task->ac_lock);
1659	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
1660		mutex_exit(&ac_task->ac_lock);
1661		return (ENOTACTIVE);
1662	}
1663	mutex_exit(&ac_task->ac_lock);
1664
1665	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
1666	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
1667	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1668	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
1669	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1670	if (flags == EP_RAW)
1671		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
1672	else
1673		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
1674	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1675
1676	bufsize = ea_pack_object(tag, NULL, 0);
1677	buf = kmem_alloc(bufsize, KM_SLEEP);
1678	(void) ea_pack_object(tag, buf, bufsize);
1679	error = exacct_vn_write(ac_task, buf, bufsize);
1680	kmem_free(buf, bufsize);
1681	ea_free_object(tag, EUP_ALLOC);
1682	return (error);
1683}
1684
1685/*
 * int exacct_tag_proc(ac_info_t *, pid_t, taskid_t, void *, size_t, int,
 *	const char *)
1687 *
1688 * Overview
1689 *   exacct_tag_proc() provides the exacct record construction and writing
1690 *   support required by putacct(2) for processes.
1691 *
1692 * Return values
1693 *   The result of the write operation is returned, unless the extended
1694 *   accounting facility is not active, in which case ENOTACTIVE is returned.
1695 *
1696 * Caller's context
1697 *   Suitable for KM_SLEEP allocations.
1698 */
1699int
1700exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
1701    size_t ubufsz, int flags, const char *hostname)
1702{
1703	int error = 0;
1704	void *buf;
1705	size_t bufsize;
1706	ea_catalog_t cat;
1707	ea_object_t *tag;
1708
1709	mutex_enter(&ac_proc->ac_lock);
1710	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
1711		mutex_exit(&ac_proc->ac_lock);
1712		return (ENOTACTIVE);
1713	}
1714	mutex_exit(&ac_proc->ac_lock);
1715
1716	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
1717	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
1718	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
1719	(void) ea_attach_item(tag, &tkid, 0,
1720	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1721	(void) ea_attach_item(tag, (void *)hostname, 0,
1722	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1723	if (flags == EP_RAW)
1724		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
1725	else
1726		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
1727	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1728
1729	bufsize = ea_pack_object(tag, NULL, 0);
1730	buf = kmem_alloc(bufsize, KM_SLEEP);
1731	(void) ea_pack_object(tag, buf, bufsize);
1732	error = exacct_vn_write(ac_proc, buf, bufsize);
1733	kmem_free(buf, bufsize);
1734	ea_free_object(tag, EUP_ALLOC);
1735	return (error);
1736}
1737
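/*
 * As noted in the comment at the top of this file, exacct_tag_task() and
 * exacct_tag_proc() are the entry points behind putacct(2): the system call
 * resolves the id/idtype it was given and then invokes the appropriate
 * routine above with the user-supplied buffer, conceptually:
 *
 *	error = exacct_tag_proc(&acg->ac_proc, pid, tkid, ubuf, ubufsz,
 *	    flags, hostname);
 *
 * The actual dispatch lives with the system call code, not here.
 */
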
1738/*
1739 * void exacct_init(void)
1740 *
1741 * Overview
 *   Initializes the extended accounting subsystem.
1743 *
1744 * Return values
1745 *   None.
1746 *
1747 * Caller's context
1748 *   Suitable for KM_SLEEP allocations.
1749 */
1750void
1751exacct_init()
1752{
1753	exacct_queue = system_taskq;
1754	exacct_object_cache = kmem_cache_create("exacct_object_cache",
1755	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1756	task_commit_thread_init();
1757}
1758
1759/*
1760 * exacct_snapshot_proc_mstate() copies a process's microstate accounting data
1761 * and resource usage counters into a given task_usage_t. It differs from
1762 * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t,
1763 * b) p_lock will have been acquired earlier in the call path and c) we
1764 * are here including the process's user and system times.
1765 */
1766static void
1767exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu)
1768{
1769	tu->tu_utime  = mstate_aggr_state(p, LMS_USER);
1770	tu->tu_stime  = mstate_aggr_state(p, LMS_SYSTEM);
1771	tu->tu_minflt = p->p_ru.minflt;
1772	tu->tu_majflt = p->p_ru.majflt;
1773	tu->tu_sndmsg = p->p_ru.msgsnd;
1774	tu->tu_rcvmsg = p->p_ru.msgrcv;
1775	tu->tu_ioch   = p->p_ru.ioch;
1776	tu->tu_iblk   = p->p_ru.inblock;
1777	tu->tu_oblk   = p->p_ru.oublock;
1778	tu->tu_vcsw   = p->p_ru.nvcsw;
1779	tu->tu_icsw   = p->p_ru.nivcsw;
1780	tu->tu_nsig   = p->p_ru.nsignals;
1781	tu->tu_nswp   = p->p_ru.nswap;
1782	tu->tu_nscl   = p->p_ru.sysc;
1783}
1784
1785/*
1786 * void exacct_move_mstate(proc_t *, task_t *, task_t *)
1787 *
1788 * Overview
1789 *   exacct_move_mstate() is called by task_change() and accounts for
1790 *   a process's resource usage when it is moved from one task to another.
1791 *
1792 *   The process's usage at this point is recorded in the new task so
1793 *   that it can be excluded from the calculation of resources consumed
1794 *   by that task.
1795 *
1796 *   The resource usage inherited by the new task is also added to the
1797 *   aggregate maintained by the old task for processes that have exited.
1798 *
1799 * Return values
1800 *   None.
1801 *
1802 * Caller's context
1803 *   pidlock and p_lock held across exacct_move_mstate().
1804 */
1805void
1806exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk)
1807{
1808	task_usage_t tu;
1809
1810	/* Take a snapshot of this process's mstate and RU counters */
1811	exacct_snapshot_proc_mstate(p, &tu);
1812
1813	/*
1814	 * Use the snapshot to increment the aggregate usage of the old
1815	 * task, and the inherited usage of the new one.
1816	 */
1817	mutex_enter(&oldtk->tk_usage_lock);
1818	exacct_add_task_mstate(oldtk->tk_usage, &tu);
1819	mutex_exit(&oldtk->tk_usage_lock);
1820	mutex_enter(&newtk->tk_usage_lock);
1821	exacct_add_task_mstate(newtk->tk_inherited, &tu);
1822	mutex_exit(&newtk->tk_usage_lock);
1823}
1824
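/*
 * As a worked example of the bookkeeping above: if a process has performed N
 * system calls while a member of task A and is then moved to task B, where it
 * performs M more before exiting, the snapshot N is added both to A's
 * tk_usage and to B's tk_inherited.  A's final record therefore includes the
 * N calls performed under it, and B's final record reports
 * (N + M) - N = M, counting only the work done after the move.
 */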