xref: /illumos-gate/usr/src/uts/common/os/dumpsubr.c (revision 1c802681)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5ae115bc7Smrj  * Common Development and Distribution License (the "License").
6ae115bc7Smrj  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
21342440ecSPrasad Singamsetty 
227c478bd9Sstevel@tonic-gate /*
2386f21945SDave Plauger  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24dfec2ecfSJohn Levon  * Copyright 2018 Joyent, Inc.
256ccea422SJoyce McIntosh  * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
267c478bd9Sstevel@tonic-gate  */
277c478bd9Sstevel@tonic-gate 
287c478bd9Sstevel@tonic-gate #include <sys/types.h>
297c478bd9Sstevel@tonic-gate #include <sys/param.h>
307c478bd9Sstevel@tonic-gate #include <sys/systm.h>
317c478bd9Sstevel@tonic-gate #include <sys/vm.h>
327c478bd9Sstevel@tonic-gate #include <sys/proc.h>
337c478bd9Sstevel@tonic-gate #include <sys/file.h>
347c478bd9Sstevel@tonic-gate #include <sys/conf.h>
357c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
367c478bd9Sstevel@tonic-gate #include <sys/mem.h>
377c478bd9Sstevel@tonic-gate #include <sys/mman.h>
387c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
397c478bd9Sstevel@tonic-gate #include <sys/errno.h>
407c478bd9Sstevel@tonic-gate #include <sys/memlist.h>
417c478bd9Sstevel@tonic-gate #include <sys/dumphdr.h>
427c478bd9Sstevel@tonic-gate #include <sys/dumpadm.h>
437c478bd9Sstevel@tonic-gate #include <sys/ksyms.h>
447c478bd9Sstevel@tonic-gate #include <sys/compress.h>
457c478bd9Sstevel@tonic-gate #include <sys/stream.h>
467c478bd9Sstevel@tonic-gate #include <sys/strsun.h>
477c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
487c478bd9Sstevel@tonic-gate #include <sys/bitmap.h>
497c478bd9Sstevel@tonic-gate #include <sys/modctl.h>
507c478bd9Sstevel@tonic-gate #include <sys/utsname.h>
517c478bd9Sstevel@tonic-gate #include <sys/systeminfo.h>
527c478bd9Sstevel@tonic-gate #include <sys/vmem.h>
537c478bd9Sstevel@tonic-gate #include <sys/log.h>
547c478bd9Sstevel@tonic-gate #include <sys/var.h>
557c478bd9Sstevel@tonic-gate #include <sys/debug.h>
567c478bd9Sstevel@tonic-gate #include <sys/sunddi.h>
577c478bd9Sstevel@tonic-gate #include <fs/fs_subr.h>
587c478bd9Sstevel@tonic-gate #include <sys/fs/snode.h>
597c478bd9Sstevel@tonic-gate #include <sys/ontrap.h>
607c478bd9Sstevel@tonic-gate #include <sys/panic.h>
617c478bd9Sstevel@tonic-gate #include <sys/dkio.h>
627c478bd9Sstevel@tonic-gate #include <sys/vtoc.h>
637c478bd9Sstevel@tonic-gate #include <sys/errorq.h>
647c478bd9Sstevel@tonic-gate #include <sys/fm/util.h>
65e7cbe64fSgw #include <sys/fs/zfs.h>
667c478bd9Sstevel@tonic-gate 
677c478bd9Sstevel@tonic-gate #include <vm/hat.h>
687c478bd9Sstevel@tonic-gate #include <vm/as.h>
697c478bd9Sstevel@tonic-gate #include <vm/page.h>
70ca3e8d88SDave Plauger #include <vm/pvn.h>
717c478bd9Sstevel@tonic-gate #include <vm/seg.h>
727c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
73d3d50737SRafael Vanoni #include <sys/clock_impl.h>
74b280b2a3SStuart Maybee #include <sys/hold_page.h>
756ccea422SJoyce McIntosh #include <sys/cpu.h>
767c478bd9Sstevel@tonic-gate 
77ca3e8d88SDave Plauger #include <bzip2/bzlib.h>
78ca3e8d88SDave Plauger 
/* One gigabyte (2^30 bytes), as an unsigned long. */
#define	ONE_GIG	(1024 * 1024 * 1024UL)
80dfec2ecfSJohn Levon 
/*
 * Crash dump time is dominated by disk write time.  To reduce this,
 * the stronger compression method bzip2 is applied to reduce the dump
 * size and hence reduce I/O time.  However, bzip2 is much more
 * computationally expensive than the existing lzjb algorithm, so to
 * avoid increasing compression time, CPUs that are otherwise idle
 * during panic are employed to parallelize the compression task.
 * Many helper CPUs are needed to prevent bzip2 from being a
 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
 * parallelized instead. Lastly, I/O and compression are performed by
 * different CPUs, and are hence overlapped in time, unlike the older
 * serial code.
 *
 * Another important consideration is the speed of the dump
 * device. Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * elevated for faster disks. The dump device speed is inferred from
 * the setting for dumpbuf.iosize, see dump_update_clevel.
 */
101ca3e8d88SDave Plauger 
102ca3e8d88SDave Plauger /*
103ca3e8d88SDave Plauger  * exported vars
104ca3e8d88SDave Plauger  */
105ca3e8d88SDave Plauger kmutex_t	dump_lock;		/* lock for dump configuration */
106ca3e8d88SDave Plauger dumphdr_t	*dumphdr;		/* dump header */
1077c478bd9Sstevel@tonic-gate int		dump_conflags = DUMP_KERNEL; /* dump configuration flags */
108ca3e8d88SDave Plauger vnode_t		*dumpvp;		/* dump device vnode pointer */
109ca3e8d88SDave Plauger u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
110ca3e8d88SDave Plauger char		*dumppath;		/* pathname of dump device */
111ca3e8d88SDave Plauger int		dump_timeout = 120;	/* timeout for dumping pages */
112ca3e8d88SDave Plauger int		dump_timeleft;		/* portion of dump_timeout remaining */
113ca3e8d88SDave Plauger int		dump_ioerr;		/* dump i/o error */
114ca3e8d88SDave Plauger int		dump_check_used;	/* enable check for used pages */
115f6e214c7SGavin Maltby char	    *dump_stack_scratch; /* scratch area for saving stack summary */
116ca3e8d88SDave Plauger 
117ca3e8d88SDave Plauger /*
118ca3e8d88SDave Plauger  * Tunables for dump compression and parallelism. These can be set via
119ca3e8d88SDave Plauger  * /etc/system.
120ca3e8d88SDave Plauger  *
121ca3e8d88SDave Plauger  * dump_ncpu_low	number of helpers for parallel lzjb
122ca3e8d88SDave Plauger  *	This is also the minimum configuration.
123ca3e8d88SDave Plauger  *
124ca3e8d88SDave Plauger  * dump_bzip2_level	bzip2 compression level: 1-9
125ca3e8d88SDave Plauger  *	Higher numbers give greater compression, but take more memory
126ca3e8d88SDave Plauger  *	and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
127ca3e8d88SDave Plauger  *
128ca3e8d88SDave Plauger  * dump_plat_mincpu	the cross-over limit for using bzip2 (per platform):
129ca3e8d88SDave Plauger  *	if dump_plat_mincpu == 0, then always do single threaded dump
130ca3e8d88SDave Plauger  *	if ncpu >= dump_plat_mincpu then try to use bzip2
131ca3e8d88SDave Plauger  *
132ca3e8d88SDave Plauger  * dump_metrics_on	if set, metrics are collected in the kernel, passed
133ca3e8d88SDave Plauger  *	to savecore via the dump file, and recorded by savecore in
134ca3e8d88SDave Plauger  *	METRICS.txt.
135ca3e8d88SDave Plauger  */
136ca3e8d88SDave Plauger uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
137ca3e8d88SDave Plauger uint_t dump_bzip2_level = 1;	/* bzip2 level (1-9) */
138ca3e8d88SDave Plauger 
1394cca9c84SDave Plauger /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
1404cca9c84SDave Plauger #define	MINCPU_NOT_SET	((uint_t)-1)
1414cca9c84SDave Plauger uint_t dump_plat_mincpu = MINCPU_NOT_SET;
1424cca9c84SDave Plauger 
1439dd77bc8SDave Plauger /* tunables for pre-reserved heap */
1449dd77bc8SDave Plauger uint_t dump_kmem_permap = 1024;
145dfec2ecfSJohn Levon uint_t dump_kmem_pages = 0;
1469dd77bc8SDave Plauger 
/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/*
 * Minimum number of helpers configured, and the matching minimum number
 * of output buffers. Both track the dump_ncpu_low tunable, so they are
 * evaluated at the point of use rather than being compile-time constants.
 */
#define	MINHELPERS	(dump_ncpu_low)
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)
154ca3e8d88SDave Plauger 
/*
 * Define constant parameters.
 *
 * DUMP_1KB, DUMP_1MB	byte-count units, typed as size_t
 *
 * CBUF_SIZE		size of an output buffer
 *
 * CBUF_MAPSHIFT	log2 of CBUF_MAPSIZE
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
171ca3e8d88SDave Plauger 
172ca3e8d88SDave Plauger /*
173ca3e8d88SDave Plauger  * Compression metrics are accumulated nano-second subtotals. The
174ca3e8d88SDave Plauger  * results are normalized by the number of pages dumped. A report is
175ca3e8d88SDave Plauger  * generated when dumpsys() completes and is saved in the dump image
176ca3e8d88SDave Plauger  * after the trailing dump header.
177ca3e8d88SDave Plauger  *
178ca3e8d88SDave Plauger  * Metrics are always collected. Set the variable dump_metrics_on to
179ca3e8d88SDave Plauger  * cause metrics to be saved in the crash file, where savecore will
180ca3e8d88SDave Plauger  * save it in the file METRICS.txt.
181ca3e8d88SDave Plauger  */
182ca3e8d88SDave Plauger #define	PERPAGES \
183ca3e8d88SDave Plauger 	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
184ca3e8d88SDave Plauger 	PERPAGE(copy) PERPAGE(compress) \
185ca3e8d88SDave Plauger 	PERPAGE(write) \
186ca3e8d88SDave Plauger 	PERPAGE(inwait) PERPAGE(outwait)
187ca3e8d88SDave Plauger 
188ca3e8d88SDave Plauger typedef struct perpage {
189ca3e8d88SDave Plauger #define	PERPAGE(x) hrtime_t x;
190ca3e8d88SDave Plauger 	PERPAGES
191ca3e8d88SDave Plauger #undef PERPAGE
192ca3e8d88SDave Plauger } perpage_t;
193ca3e8d88SDave Plauger 
194ca3e8d88SDave Plauger /*
195ca3e8d88SDave Plauger  * This macro controls the code generation for collecting dump
196ca3e8d88SDave Plauger  * performance information. By default, the code is generated, but
197ca3e8d88SDave Plauger  * automatic saving of the information is disabled. If dump_metrics_on
198ca3e8d88SDave Plauger  * is set to 1, the timing information is passed to savecore via the
199ca3e8d88SDave Plauger  * crash file, where it is appended to the file dump-dir/METRICS.txt.
200ca3e8d88SDave Plauger  */
201ca3e8d88SDave Plauger #define	COLLECT_METRICS
202ca3e8d88SDave Plauger 
203ca3e8d88SDave Plauger #ifdef COLLECT_METRICS
204ca3e8d88SDave Plauger uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */
205ca3e8d88SDave Plauger 
206ca3e8d88SDave Plauger #define	HRSTART(v, m)		v##ts.m = gethrtime()
207ca3e8d88SDave Plauger #define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
208ca3e8d88SDave Plauger #define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
209ca3e8d88SDave Plauger #define	HREND(v, m)		v.m += gethrtime() - v##ts.m
210ca3e8d88SDave Plauger #define	HRNORM(v, m, n)		v.m /= (n)
2117c478bd9Sstevel@tonic-gate 
2127c478bd9Sstevel@tonic-gate #else
213ca3e8d88SDave Plauger #define	HRSTART(v, m)
214ca3e8d88SDave Plauger #define	HRSTOP(v, m)
215ca3e8d88SDave Plauger #define	HRBEGIN(v, m, s)
216ca3e8d88SDave Plauger #define	HREND(v, m)
217ca3e8d88SDave Plauger #define	HRNORM(v, m, n)
218ca3e8d88SDave Plauger #endif	/* COLLECT_METRICS */
219ca3e8d88SDave Plauger 
/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 *
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors.)
 */

typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;
252ca3e8d88SDave Plauger 
253ca3e8d88SDave Plauger typedef struct cbuf cbuf_t;
254ca3e8d88SDave Plauger 
255ca3e8d88SDave Plauger struct cbuf {
256ca3e8d88SDave Plauger 	cbuf_t *next;			/* next in list */
257ca3e8d88SDave Plauger 	cbufstate_t state;		/* processing state */
258ca3e8d88SDave Plauger 	size_t used;			/* amount used */
259ca3e8d88SDave Plauger 	size_t size;			/* mem size */
260ca3e8d88SDave Plauger 	char *buf;			/* kmem or vmem */
261ca3e8d88SDave Plauger 	pgcnt_t pagenum;		/* index to pfn map */
262ca3e8d88SDave Plauger 	pgcnt_t bitnum;			/* first set bitnum */
263ca3e8d88SDave Plauger 	pfn_t pfn;			/* first pfn in mapped range */
264ca3e8d88SDave Plauger 	int off;			/* byte offset to first pfn */
265ca3e8d88SDave Plauger };
266ca3e8d88SDave Plauger 
/* Buffer sized for a 36-character string plus the terminating NUL. */
static char dump_osimage_uuid[36 + 1];

/*
 * Minimal ASCII character-class tests. These are macros and evaluate
 * their argument more than once, so do not pass an expression with
 * side effects (e.g. *p++).
 */
#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define	isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
			((ch) >= 'A' && (ch) <= 'F'))
272f6e214c7SGavin Maltby 
273ca3e8d88SDave Plauger /*
274ca3e8d88SDave Plauger  * cqueue_t queues: a uni-directional channel for communication
275ca3e8d88SDave Plauger  * from the master to helper tasks or vice-versa using put and
276ca3e8d88SDave Plauger  * get primitives. Both mappings and data buffers are passed via
277ca3e8d88SDave Plauger  * queues. Producers close a queue when done. The number of
278ca3e8d88SDave Plauger  * active producers is reference counted so the consumer can
279ca3e8d88SDave Plauger  * detect end of data. Concurrent access is mediated by atomic
280ca3e8d88SDave Plauger  * operations for panic dump, or mutex/cv for live dump.
281ca3e8d88SDave Plauger  *
282ca3e8d88SDave Plauger  * There a four queues, used as follows:
283ca3e8d88SDave Plauger  *
284ca3e8d88SDave Plauger  * Queue		Dataflow		NewState
285ca3e8d88SDave Plauger  * --------------------------------------------------
286ca3e8d88SDave Plauger  * mainq		master -> master	FREEMAP
287ca3e8d88SDave Plauger  * master has initialized or unmapped an input buffer
288ca3e8d88SDave Plauger  * --------------------------------------------------
289ca3e8d88SDave Plauger  * helperq		master -> helper	INREADY
290ca3e8d88SDave Plauger  * master has mapped input for use by helper
291ca3e8d88SDave Plauger  * --------------------------------------------------
292ca3e8d88SDave Plauger  * mainq		master <- helper	USEDMAP
293ca3e8d88SDave Plauger  * helper is done with input
294ca3e8d88SDave Plauger  * --------------------------------------------------
295ca3e8d88SDave Plauger  * freebufq		master -> helper	FREEBUF
296ca3e8d88SDave Plauger  * master has initialized or written an output buffer
297ca3e8d88SDave Plauger  * --------------------------------------------------
298ca3e8d88SDave Plauger  * mainq		master <- helper	WRITE
299ca3e8d88SDave Plauger  * block of compressed pages from a helper
300ca3e8d88SDave Plauger  * --------------------------------------------------
301ca3e8d88SDave Plauger  * mainq		master <- helper	ERRMSG
302ca3e8d88SDave Plauger  * error messages from a helper (memory error case)
303ca3e8d88SDave Plauger  * --------------------------------------------------
304ca3e8d88SDave Plauger  * writerq		master <- master	WRITE
305ca3e8d88SDave Plauger  * non-blocking queue of blocks to write
306ca3e8d88SDave Plauger  * --------------------------------------------------
307ca3e8d88SDave Plauger  */
308ca3e8d88SDave Plauger typedef struct cqueue {
309ca3e8d88SDave Plauger 	cbuf_t *volatile first;		/* first in list */
310ca3e8d88SDave Plauger 	cbuf_t *last;			/* last in list */
311ca3e8d88SDave Plauger 	hrtime_t ts;			/* timestamp */
312ca3e8d88SDave Plauger 	hrtime_t empty;			/* total time empty */
313ca3e8d88SDave Plauger 	kmutex_t mutex;			/* live state lock */
314ca3e8d88SDave Plauger 	kcondvar_t cv;			/* live wait var */
315ca3e8d88SDave Plauger 	lock_t spinlock;		/* panic mode spin lock */
316ca3e8d88SDave Plauger 	volatile uint_t open;		/* producer ref count */
317ca3e8d88SDave Plauger } cqueue_t;
318ca3e8d88SDave Plauger 
/*
 * Convenience macros for using the cqueue functions
 * Note that the caller must have defined "dumpsync_t *ds"
 *
 * In each macro "q" is the name of a cqueue_t member of *ds (for
 * example mainq), not a pointer. CQ_CLOSE, CQ_PUT and CQ_GET also pass
 * ds->live through to the underlying dumpsys_*_cq function.
 */
#define	CQ_IS_EMPTY(q)					\
	(ds->q.first == NULL)

#define	CQ_OPEN(q)					\
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q)					\
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st)				\
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q)					\
	dumpsys_get_cq(&ds->q, ds->live)
337ca3e8d88SDave Plauger 
338ca3e8d88SDave Plauger /*
339ca3e8d88SDave Plauger  * Dynamic state when dumpsys() is running.
340ca3e8d88SDave Plauger  */
341ca3e8d88SDave Plauger typedef struct dumpsync {
342ca3e8d88SDave Plauger 	pgcnt_t npages;			/* subtotal of pages dumped */
343ca3e8d88SDave Plauger 	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
344ca3e8d88SDave Plauger 	pgcnt_t pages_used;		/* subtotal of pages used per map */
345ca3e8d88SDave Plauger 	size_t nwrite;			/* subtotal of bytes written */
346ca3e8d88SDave Plauger 	uint_t live;			/* running live dump */
347ca3e8d88SDave Plauger 	uint_t neednl;			/* will need to print a newline */
348ca3e8d88SDave Plauger 	uint_t percent;			/* dump progress */
349ca3e8d88SDave Plauger 	uint_t percent_done;		/* dump progress reported */
3502fb0949cSJoshua M. Clulow 	int sec_done;			/* dump progress last report time */
351ca3e8d88SDave Plauger 	cqueue_t freebufq;		/* free kmem bufs for writing */
352ca3e8d88SDave Plauger 	cqueue_t mainq;			/* input for main task */
353ca3e8d88SDave Plauger 	cqueue_t helperq;		/* input for helpers */
354ca3e8d88SDave Plauger 	cqueue_t writerq;		/* input for writer */
355ca3e8d88SDave Plauger 	hrtime_t start;			/* start time */
356ca3e8d88SDave Plauger 	hrtime_t elapsed;		/* elapsed time when completed */
357ca3e8d88SDave Plauger 	hrtime_t iotime;		/* time spent writing nwrite bytes */
358ca3e8d88SDave Plauger 	hrtime_t iowait;		/* time spent waiting for output */
359ca3e8d88SDave Plauger 	hrtime_t iowaitts;		/* iowait timestamp */
360ca3e8d88SDave Plauger 	perpage_t perpage;		/* metrics */
361ca3e8d88SDave Plauger 	perpage_t perpagets;
362ca3e8d88SDave Plauger 	int dumpcpu;			/* master cpu */
363ca3e8d88SDave Plauger } dumpsync_t;
364ca3e8d88SDave Plauger 
365ca3e8d88SDave Plauger static dumpsync_t dumpsync;		/* synchronization vars */
366ca3e8d88SDave Plauger 
367ca3e8d88SDave Plauger /*
368ca3e8d88SDave Plauger  * helper_t helpers: contains the context for a stream. CPUs run in
369ca3e8d88SDave Plauger  * parallel at dump time; each CPU creates a single stream of
370ca3e8d88SDave Plauger  * compression data.  Stream data is divided into CBUF_SIZE blocks.
371ca3e8d88SDave Plauger  * The blocks are written in order within a stream. But, blocks from
372ca3e8d88SDave Plauger  * multiple streams can be interleaved. Each stream is identified by a
373ca3e8d88SDave Plauger  * unique tag.
374ca3e8d88SDave Plauger  */
375ca3e8d88SDave Plauger typedef struct helper {
376ca3e8d88SDave Plauger 	int helper;			/* bound helper id */
377ca3e8d88SDave Plauger 	int tag;			/* compression stream tag */
378ca3e8d88SDave Plauger 	perpage_t perpage;		/* per page metrics */
379ca3e8d88SDave Plauger 	perpage_t perpagets;		/* per page metrics (timestamps) */
380ca3e8d88SDave Plauger 	taskqid_t taskqid;		/* live dump task ptr */
381ca3e8d88SDave Plauger 	int in, out;			/* buffer offsets */
382ca3e8d88SDave Plauger 	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
383ca3e8d88SDave Plauger 	dumpsync_t *ds;			/* pointer to sync vars */
384ca3e8d88SDave Plauger 	size_t used;			/* counts input consumed */
385ca3e8d88SDave Plauger 	char *page;			/* buffer for page copy */
386ca3e8d88SDave Plauger 	char *lzbuf;			/* lzjb output */
387ca3e8d88SDave Plauger 	bz_stream bzstream;		/* bzip2 state */
388ca3e8d88SDave Plauger } helper_t;
389ca3e8d88SDave Plauger 
/* Reserved (negative) helper ids. */
#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */
3937c478bd9Sstevel@tonic-gate 
394ca3e8d88SDave Plauger /*
395ca3e8d88SDave Plauger  * configuration vars for dumpsys
396ca3e8d88SDave Plauger  */
397ca3e8d88SDave Plauger typedef struct dumpcfg {
398ca3e8d88SDave Plauger 	int	threshold;	/* ncpu threshold for bzip2 */
399ca3e8d88SDave Plauger 	int	nhelper;	/* number of helpers */
400ca3e8d88SDave Plauger 	int	nhelper_used;	/* actual number of helpers used */
401ca3e8d88SDave Plauger 	int	ncmap;		/* number VA pages for compression */
402ca3e8d88SDave Plauger 	int	ncbuf;		/* number of bufs for compression */
403ca3e8d88SDave Plauger 	int	ncbuf_used;	/* number of bufs in use */
404ca3e8d88SDave Plauger 	uint_t	clevel;		/* dump compression level */
405ca3e8d88SDave Plauger 	helper_t *helper;	/* array of helpers */
406ca3e8d88SDave Plauger 	cbuf_t	*cmap;		/* array of input (map) buffers */
407ca3e8d88SDave Plauger 	cbuf_t	*cbuf;		/* array of output  buffers */
408ca3e8d88SDave Plauger 	ulong_t	*helpermap;	/* set of dumpsys helper CPU ids */
409ca3e8d88SDave Plauger 	ulong_t	*bitmap;	/* bitmap for marking pages to dump */
410ca3e8d88SDave Plauger 	ulong_t	*rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
411ca3e8d88SDave Plauger 	pgcnt_t	bitmapsize;	/* size of bitmap */
412ca3e8d88SDave Plauger 	pgcnt_t	rbitmapsize;	/* size of bitmap for ranges */
413ca3e8d88SDave Plauger 	pgcnt_t found4m;	/* number ranges allocated by dump */
414ca3e8d88SDave Plauger 	pgcnt_t foundsm;	/* number small pages allocated by dump */
415ca3e8d88SDave Plauger 	pid_t	*pids;		/* list of process IDs at dump time */
416ca3e8d88SDave Plauger 	size_t	maxsize;	/* memory size needed at dump time */
417ca3e8d88SDave Plauger 	size_t	maxvmsize;	/* size of reserved VM */
418ca3e8d88SDave Plauger 	char	*maxvm;		/* reserved VM for spare pages */
419ca3e8d88SDave Plauger 	lock_t	helper_lock;	/* protect helper state */
420ca3e8d88SDave Plauger 	char	helpers_wanted;	/* flag to enable parallelism */
421ca3e8d88SDave Plauger } dumpcfg_t;
422ca3e8d88SDave Plauger 
423ca3e8d88SDave Plauger static dumpcfg_t dumpcfg;	/* config vars */
424ca3e8d88SDave Plauger 
425ca3e8d88SDave Plauger /*
426ca3e8d88SDave Plauger  * The dump I/O buffer.
427ca3e8d88SDave Plauger  *
428ca3e8d88SDave Plauger  * There is one I/O buffer used by dumpvp_write and dumvp_flush. It is
429ca3e8d88SDave Plauger  * sized according to the optimum device transfer speed.
430ca3e8d88SDave Plauger  */
431ca3e8d88SDave Plauger typedef struct dumpbuf {
432ca3e8d88SDave Plauger 	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
433ca3e8d88SDave Plauger 	len_t	vp_limit;	/* maximum write offset */
434ca3e8d88SDave Plauger 	offset_t vp_off;	/* current dump device offset */
435ca3e8d88SDave Plauger 	char	*cur;		/* dump write pointer */
436ca3e8d88SDave Plauger 	char	*start;		/* dump buffer address */
437ca3e8d88SDave Plauger 	char	*end;		/* dump buffer end */
438ca3e8d88SDave Plauger 	size_t	size;		/* size of dumpbuf in bytes */
439ca3e8d88SDave Plauger 	size_t	iosize;		/* best transfer size for device */
440ca3e8d88SDave Plauger } dumpbuf_t;
441ca3e8d88SDave Plauger 
442ca3e8d88SDave Plauger dumpbuf_t dumpbuf;		/* I/O buffer */
443ca3e8d88SDave Plauger 
/*
 * For parallel dump, defines the maximum time the main task thread will
 * wait for at least one helper to register in dumpcfg.helpermap, before
 * assuming there are no helpers and falling back to serial mode.
 * The value is chosen arbitrarily and provides a *really* long wait for
 * any available helper to register.
 */
#define	DUMP_HELPER_MAX_WAIT	1000	/* millisec */
4526ccea422SJoyce McIntosh 
453ca3e8d88SDave Plauger /*
454ca3e8d88SDave Plauger  * The dump I/O buffer must be at least one page, at most xfer_size
455ca3e8d88SDave Plauger  * bytes, and should scale with physmem in between.  The transfer size
456ca3e8d88SDave Plauger  * passed in will either represent a global default (maxphys) or the
457ca3e8d88SDave Plauger  * best size for the device.  The size of the dumpbuf I/O buffer is
458ca3e8d88SDave Plauger  * limited by dumpbuf_limit (8MB by default) because the dump
459ca3e8d88SDave Plauger  * performance saturates beyond a certain size.  The default is to
460ca3e8d88SDave Plauger  * select 1/4096 of the memory.
4617c478bd9Sstevel@tonic-gate  */
462ca3e8d88SDave Plauger static int	dumpbuf_fraction = 12;	/* memory size scale factor */
463ca3e8d88SDave Plauger static size_t	dumpbuf_limit = 8 * DUMP_1MB;	/* max I/O buf size */
464ca3e8d88SDave Plauger 
4657c478bd9Sstevel@tonic-gate static size_t
dumpbuf_iosize(size_t xfer_size)4667c478bd9Sstevel@tonic-gate dumpbuf_iosize(size_t xfer_size)
4677c478bd9Sstevel@tonic-gate {
468ca3e8d88SDave Plauger 	size_t iosize = ptob(physmem >> dumpbuf_fraction);
469ca3e8d88SDave Plauger 
470ca3e8d88SDave Plauger 	if (iosize < PAGESIZE)
471ca3e8d88SDave Plauger 		iosize = PAGESIZE;
472ca3e8d88SDave Plauger 	else if (iosize > xfer_size)
473ca3e8d88SDave Plauger 		iosize = xfer_size;
474ca3e8d88SDave Plauger 	if (iosize > dumpbuf_limit)
475ca3e8d88SDave Plauger 		iosize = dumpbuf_limit;
476ca3e8d88SDave Plauger 	return (iosize & PAGEMASK);
4777c478bd9Sstevel@tonic-gate }
4787c478bd9Sstevel@tonic-gate 
479ca3e8d88SDave Plauger /*
480ca3e8d88SDave Plauger  * resize the I/O buffer
481ca3e8d88SDave Plauger  */
4827c478bd9Sstevel@tonic-gate static void
dumpbuf_resize(void)4837c478bd9Sstevel@tonic-gate dumpbuf_resize(void)
4847c478bd9Sstevel@tonic-gate {
485ca3e8d88SDave Plauger 	char *old_buf = dumpbuf.start;
486ca3e8d88SDave Plauger 	size_t old_size = dumpbuf.size;
4877c478bd9Sstevel@tonic-gate 	char *new_buf;
4887c478bd9Sstevel@tonic-gate 	size_t new_size;
4897c478bd9Sstevel@tonic-gate 
4907c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&dump_lock));
4917c478bd9Sstevel@tonic-gate 
492ca3e8d88SDave Plauger 	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
493ca3e8d88SDave Plauger 	if (new_size <= old_size)
4947c478bd9Sstevel@tonic-gate 		return; /* no need to reallocate buffer */
4957c478bd9Sstevel@tonic-gate 
4967c478bd9Sstevel@tonic-gate 	new_buf = kmem_alloc(new_size, KM_SLEEP);
497ca3e8d88SDave Plauger 	dumpbuf.size = new_size;
498ca3e8d88SDave Plauger 	dumpbuf.start = new_buf;
499ca3e8d88SDave Plauger 	dumpbuf.end = new_buf + new_size;
5007c478bd9Sstevel@tonic-gate 	kmem_free(old_buf, old_size);
5017c478bd9Sstevel@tonic-gate }
5027c478bd9Sstevel@tonic-gate 
503ca3e8d88SDave Plauger /*
504ca3e8d88SDave Plauger  * dump_update_clevel is called when dumpadm configures the dump device.
5056ccea422SJoyce McIntosh  *	Calculate number of helpers and buffers.
5066ccea422SJoyce McIntosh  *	Allocate the minimum configuration for now.
507ca3e8d88SDave Plauger  *
508ca3e8d88SDave Plauger  * When the dump file is configured we reserve a minimum amount of
509ca3e8d88SDave Plauger  * memory for use at crash time. But we reserve VA for all the memory
510ca3e8d88SDave Plauger  * we really want in order to do the fastest dump possible. The VA is
511ca3e8d88SDave Plauger  * backed by pages not being dumped, according to the bitmap. If
512ca3e8d88SDave Plauger  * there is insufficient spare memory, however, we fall back to the
513ca3e8d88SDave Plauger  * minimum.
514ca3e8d88SDave Plauger  *
515ca3e8d88SDave Plauger  * Live dump (savecore -L) always uses the minimum config.
516ca3e8d88SDave Plauger  *
517ca3e8d88SDave Plauger  * clevel 0 is single threaded lzjb
518ca3e8d88SDave Plauger  * clevel 1 is parallel lzjb
519ca3e8d88SDave Plauger  * clevel 2 is parallel bzip2
520ca3e8d88SDave Plauger  *
521ca3e8d88SDave Plauger  * The ncpu threshold is selected with dump_plat_mincpu.
522ca3e8d88SDave Plauger  * On OPL, set_platform_defaults() overrides the sun4u setting.
523ca3e8d88SDave Plauger  * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
524ca3e8d88SDave Plauger  *
525ca3e8d88SDave Plauger  * Architecture		Threshold	Algorithm
5266ccea422SJoyce McIntosh  * sun4u		<  51		parallel lzjb
5276ccea422SJoyce McIntosh  * sun4u		>= 51		parallel bzip2(*)
5286ccea422SJoyce McIntosh  * sun4u OPL		<  8		parallel lzjb
5296ccea422SJoyce McIntosh  * sun4u OPL		>= 8		parallel bzip2(*)
5306ccea422SJoyce McIntosh  * sun4v		<  128		parallel lzjb
5316ccea422SJoyce McIntosh  * sun4v		>= 128		parallel bzip2(*)
532ca3e8d88SDave Plauger  * x86			< 11		parallel lzjb
533ca3e8d88SDave Plauger  * x86			>= 11		parallel bzip2(*)
5346ccea422SJoyce McIntosh  * 32-bit		N/A		single-threaded lzjb
535ca3e8d88SDave Plauger  *
536ca3e8d88SDave Plauger  * (*) bzip2 is only chosen if there is sufficient available
537ca3e8d88SDave Plauger  * memory for buffers at dump time. See dumpsys_get_maxmem().
538ca3e8d88SDave Plauger  *
539ca3e8d88SDave Plauger  * Faster dump devices have larger I/O buffers. The threshold value is
540ca3e8d88SDave Plauger  * increased according to the size of the dump I/O buffer, because
541ca3e8d88SDave Plauger  * parallel lzjb performs better with faster disks. For buffers >= 1MB
542ca3e8d88SDave Plauger  * the threshold is 3X; for buffers >= 256K threshold is 2X.
543ca3e8d88SDave Plauger  *
544ca3e8d88SDave Plauger  * For parallel dumps, the number of helpers is ncpu-1. The CPU
545ca3e8d88SDave Plauger  * running panic runs the main task. For single-threaded dumps, the
546ca3e8d88SDave Plauger  * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
547ca3e8d88SDave Plauger  *
548ca3e8d88SDave Plauger  * Need multiple buffers per helper so that they do not block waiting
549ca3e8d88SDave Plauger  * for the main task.
550ca3e8d88SDave Plauger  *				parallel	single-threaded
551ca3e8d88SDave Plauger  * Number of output buffers:	nhelper*2		1
552ca3e8d88SDave Plauger  * Number of mapping buffers:	nhelper*4		1
553ca3e8d88SDave Plauger  *
554ca3e8d88SDave Plauger  */
555ca3e8d88SDave Plauger static void
dump_update_clevel()556ca3e8d88SDave Plauger dump_update_clevel()
557ca3e8d88SDave Plauger {
558ca3e8d88SDave Plauger 	int tag;
559ca3e8d88SDave Plauger 	size_t bz2size;
560ca3e8d88SDave Plauger 	helper_t *hp, *hpend;
561ca3e8d88SDave Plauger 	cbuf_t *cp, *cpend;
562ca3e8d88SDave Plauger 	dumpcfg_t *old = &dumpcfg;
563ca3e8d88SDave Plauger 	dumpcfg_t newcfg = *old;
564ca3e8d88SDave Plauger 	dumpcfg_t *new = &newcfg;
565ca3e8d88SDave Plauger 
566ca3e8d88SDave Plauger 	ASSERT(MUTEX_HELD(&dump_lock));
567ca3e8d88SDave Plauger 
568ca3e8d88SDave Plauger 	/*
569ca3e8d88SDave Plauger 	 * Free the previously allocated bufs and VM.
570ca3e8d88SDave Plauger 	 */
571ca3e8d88SDave Plauger 	if (old->helper != NULL) {
572ca3e8d88SDave Plauger 
573ca3e8d88SDave Plauger 		/* helpers */
574ca3e8d88SDave Plauger 		hpend = &old->helper[old->nhelper];
575ca3e8d88SDave Plauger 		for (hp = old->helper; hp != hpend; hp++) {
576ca3e8d88SDave Plauger 			if (hp->lzbuf != NULL)
577ca3e8d88SDave Plauger 				kmem_free(hp->lzbuf, PAGESIZE);
578ca3e8d88SDave Plauger 			if (hp->page != NULL)
579ca3e8d88SDave Plauger 				kmem_free(hp->page, PAGESIZE);
580ca3e8d88SDave Plauger 		}
581ca3e8d88SDave Plauger 		kmem_free(old->helper, old->nhelper * sizeof (helper_t));
582ca3e8d88SDave Plauger 
583ca3e8d88SDave Plauger 		/* VM space for mapping pages */
584ca3e8d88SDave Plauger 		cpend = &old->cmap[old->ncmap];
585ca3e8d88SDave Plauger 		for (cp = old->cmap; cp != cpend; cp++)
586ca3e8d88SDave Plauger 			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
587ca3e8d88SDave Plauger 		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));
588ca3e8d88SDave Plauger 
589ca3e8d88SDave Plauger 		/* output bufs */
590ca3e8d88SDave Plauger 		cpend = &old->cbuf[old->ncbuf];
591ca3e8d88SDave Plauger 		for (cp = old->cbuf; cp != cpend; cp++)
592ca3e8d88SDave Plauger 			if (cp->buf != NULL)
593ca3e8d88SDave Plauger 				kmem_free(cp->buf, cp->size);
594ca3e8d88SDave Plauger 		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));
595ca3e8d88SDave Plauger 
596ca3e8d88SDave Plauger 		/* reserved VM for dumpsys_get_maxmem */
597ca3e8d88SDave Plauger 		if (old->maxvmsize > 0)
598ca3e8d88SDave Plauger 			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
599ca3e8d88SDave Plauger 	}
600ca3e8d88SDave Plauger 
601ca3e8d88SDave Plauger 	/*
602ca3e8d88SDave Plauger 	 * Allocate memory and VM.
603ca3e8d88SDave Plauger 	 * One CPU runs dumpsys, the rest are helpers.
604ca3e8d88SDave Plauger 	 */
605ca3e8d88SDave Plauger 	new->nhelper = ncpus - 1;
606ca3e8d88SDave Plauger 	if (new->nhelper < 1)
607ca3e8d88SDave Plauger 		new->nhelper = 1;
608ca3e8d88SDave Plauger 
609ca3e8d88SDave Plauger 	if (new->nhelper > DUMP_MAX_NHELPER)
610ca3e8d88SDave Plauger 		new->nhelper = DUMP_MAX_NHELPER;
611ca3e8d88SDave Plauger 
6124cca9c84SDave Plauger 	/* use platform default, unless /etc/system overrides */
6134cca9c84SDave Plauger 	if (dump_plat_mincpu == MINCPU_NOT_SET)
6144cca9c84SDave Plauger 		dump_plat_mincpu = dump_plat_mincpu_default;
6154cca9c84SDave Plauger 
616ca3e8d88SDave Plauger 	/* increase threshold for faster disks */
617ca3e8d88SDave Plauger 	new->threshold = dump_plat_mincpu;
618ca3e8d88SDave Plauger 	if (dumpbuf.iosize >= DUMP_1MB)
619ca3e8d88SDave Plauger 		new->threshold *= 3;
620ca3e8d88SDave Plauger 	else if (dumpbuf.iosize >= (256 * DUMP_1KB))
621ca3e8d88SDave Plauger 		new->threshold *= 2;
622ca3e8d88SDave Plauger 
623ca3e8d88SDave Plauger 	/* figure compression level based upon the computed threshold. */
624ca3e8d88SDave Plauger 	if (dump_plat_mincpu == 0 || new->nhelper < 2) {
625ca3e8d88SDave Plauger 		new->clevel = 0;
626ca3e8d88SDave Plauger 		new->nhelper = 1;
627ca3e8d88SDave Plauger 	} else if ((new->nhelper + 1) >= new->threshold) {
628ca3e8d88SDave Plauger 		new->clevel = DUMP_CLEVEL_BZIP2;
629ca3e8d88SDave Plauger 	} else {
630ca3e8d88SDave Plauger 		new->clevel = DUMP_CLEVEL_LZJB;
631ca3e8d88SDave Plauger 	}
632ca3e8d88SDave Plauger 
633ca3e8d88SDave Plauger 	if (new->clevel == 0) {
634ca3e8d88SDave Plauger 		new->ncbuf = 1;
635ca3e8d88SDave Plauger 		new->ncmap = 1;
636ca3e8d88SDave Plauger 	} else {
637ca3e8d88SDave Plauger 		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
638ca3e8d88SDave Plauger 		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
639ca3e8d88SDave Plauger 	}
640ca3e8d88SDave Plauger 
641ca3e8d88SDave Plauger 	/*
642ca3e8d88SDave Plauger 	 * Allocate new data structures and buffers for MINHELPERS,
643ca3e8d88SDave Plauger 	 * and also figure the max desired size.
644ca3e8d88SDave Plauger 	 */
645ca3e8d88SDave Plauger 	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
646ca3e8d88SDave Plauger 	new->maxsize = 0;
647ca3e8d88SDave Plauger 	new->maxvmsize = 0;
648ca3e8d88SDave Plauger 	new->maxvm = NULL;
649ca3e8d88SDave Plauger 	tag = 1;
650ca3e8d88SDave Plauger 	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
651ca3e8d88SDave Plauger 	hpend = &new->helper[new->nhelper];
652ca3e8d88SDave Plauger 	for (hp = new->helper; hp != hpend; hp++) {
653ca3e8d88SDave Plauger 		hp->tag = tag++;
654ca3e8d88SDave Plauger 		if (hp < &new->helper[MINHELPERS]) {
655ca3e8d88SDave Plauger 			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
656ca3e8d88SDave Plauger 			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
657ca3e8d88SDave Plauger 		} else if (new->clevel < DUMP_CLEVEL_BZIP2) {
658ca3e8d88SDave Plauger 			new->maxsize += 2 * PAGESIZE;
659ca3e8d88SDave Plauger 		} else {
660ca3e8d88SDave Plauger 			new->maxsize += PAGESIZE;
661ca3e8d88SDave Plauger 		}
662ca3e8d88SDave Plauger 		if (new->clevel >= DUMP_CLEVEL_BZIP2)
663ca3e8d88SDave Plauger 			new->maxsize += bz2size;
664ca3e8d88SDave Plauger 	}
665ca3e8d88SDave Plauger 
666ca3e8d88SDave Plauger 	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
667ca3e8d88SDave Plauger 	cpend = &new->cbuf[new->ncbuf];
668ca3e8d88SDave Plauger 	for (cp = new->cbuf; cp != cpend; cp++) {
669ca3e8d88SDave Plauger 		cp->state = CBUF_FREEBUF;
670ca3e8d88SDave Plauger 		cp->size = CBUF_SIZE;
671ca3e8d88SDave Plauger 		if (cp < &new->cbuf[MINCBUFS])
672ca3e8d88SDave Plauger 			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
673ca3e8d88SDave Plauger 		else
674ca3e8d88SDave Plauger 			new->maxsize += cp->size;
675ca3e8d88SDave Plauger 	}
676ca3e8d88SDave Plauger 
677ca3e8d88SDave Plauger 	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
678ca3e8d88SDave Plauger 	cpend = &new->cmap[new->ncmap];
679ca3e8d88SDave Plauger 	for (cp = new->cmap; cp != cpend; cp++) {
680ca3e8d88SDave Plauger 		cp->state = CBUF_FREEMAP;
681ca3e8d88SDave Plauger 		cp->size = CBUF_MAPSIZE;
682ca3e8d88SDave Plauger 		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
683ca3e8d88SDave Plauger 		    0, 0, NULL, NULL, VM_SLEEP);
684ca3e8d88SDave Plauger 	}
685ca3e8d88SDave Plauger 
686ca3e8d88SDave Plauger 	/* reserve VA to be backed with spare pages at crash time */
687ca3e8d88SDave Plauger 	if (new->maxsize > 0) {
688ca3e8d88SDave Plauger 		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
689ca3e8d88SDave Plauger 		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
690ca3e8d88SDave Plauger 		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
691ca3e8d88SDave Plauger 		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
692ca3e8d88SDave Plauger 	}
693ca3e8d88SDave Plauger 
6949dd77bc8SDave Plauger 	/*
695dfec2ecfSJohn Levon 	 * Reserve memory for kmem allocation calls made during crash dump.  The
696dfec2ecfSJohn Levon 	 * hat layer allocates memory for each mapping created, and the I/O path
697dfec2ecfSJohn Levon 	 * allocates buffers and data structs.
698dfec2ecfSJohn Levon 	 *
699dfec2ecfSJohn Levon 	 * On larger systems, we easily exceed the lower amount, so we need some
700dfec2ecfSJohn Levon 	 * more space; the cut-over point is relatively arbitrary.  If we run
701dfec2ecfSJohn Levon 	 * out, the only impact is that kmem state in the dump becomes
702dfec2ecfSJohn Levon 	 * inconsistent.
7039dd77bc8SDave Plauger 	 */
704dfec2ecfSJohn Levon 
705dfec2ecfSJohn Levon 	if (dump_kmem_pages == 0) {
706dfec2ecfSJohn Levon 		if (physmem > (16 * ONE_GIG) / PAGESIZE)
707dfec2ecfSJohn Levon 			dump_kmem_pages = 20;
708dfec2ecfSJohn Levon 		else
709dfec2ecfSJohn Levon 			dump_kmem_pages = 8;
710dfec2ecfSJohn Levon 	}
711dfec2ecfSJohn Levon 
7129dd77bc8SDave Plauger 	kmem_dump_init((new->ncmap * dump_kmem_permap) +
7139dd77bc8SDave Plauger 	    (dump_kmem_pages * PAGESIZE));
7149dd77bc8SDave Plauger 
715ca3e8d88SDave Plauger 	/* set new config pointers */
716ca3e8d88SDave Plauger 	*old = *new;
717ca3e8d88SDave Plauger }
718ca3e8d88SDave Plauger 
719ca3e8d88SDave Plauger /*
720ca3e8d88SDave Plauger  * Define a struct memlist walker to optimize bitnum to pfn
721ca3e8d88SDave Plauger  * lookup. The walker maintains the state of the list traversal.
722ca3e8d88SDave Plauger  */
723ca3e8d88SDave Plauger typedef struct dumpmlw {
724ca3e8d88SDave Plauger 	struct memlist	*mp;		/* current memlist */
725ca3e8d88SDave Plauger 	pgcnt_t		basenum;	/* bitnum base offset */
726ca3e8d88SDave Plauger 	pgcnt_t		mppages;	/* current memlist size */
727ca3e8d88SDave Plauger 	pgcnt_t		mpleft;		/* size to end of current memlist */
728ca3e8d88SDave Plauger 	pfn_t		mpaddr;		/* first pfn in memlist */
729ca3e8d88SDave Plauger } dumpmlw_t;
730ca3e8d88SDave Plauger 
731ca3e8d88SDave Plauger /* initialize the walker */
732ca3e8d88SDave Plauger static inline void
dump_init_memlist_walker(dumpmlw_t * pw)733ca3e8d88SDave Plauger dump_init_memlist_walker(dumpmlw_t *pw)
734ca3e8d88SDave Plauger {
735ca3e8d88SDave Plauger 	pw->mp = phys_install;
736ca3e8d88SDave Plauger 	pw->basenum = 0;
73756f33205SJonathan Adams 	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
738ca3e8d88SDave Plauger 	pw->mpleft = pw->mppages;
73956f33205SJonathan Adams 	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
740ca3e8d88SDave Plauger }
741ca3e8d88SDave Plauger 
742ca3e8d88SDave Plauger /*
743ca3e8d88SDave Plauger  * Lookup pfn given bitnum. The memlist can be quite long on some
744ca3e8d88SDave Plauger  * systems (e.g.: one per board). To optimize sequential lookups, the
745ca3e8d88SDave Plauger  * caller initializes and presents a memlist walker.
746ca3e8d88SDave Plauger  */
747ca3e8d88SDave Plauger static pfn_t
dump_bitnum_to_pfn(pgcnt_t bitnum,dumpmlw_t * pw)748ca3e8d88SDave Plauger dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
749ca3e8d88SDave Plauger {
750ca3e8d88SDave Plauger 	bitnum -= pw->basenum;
751ca3e8d88SDave Plauger 	while (pw->mp != NULL) {
752ca3e8d88SDave Plauger 		if (bitnum < pw->mppages) {
753ca3e8d88SDave Plauger 			pw->mpleft = pw->mppages - bitnum;
754ca3e8d88SDave Plauger 			return (pw->mpaddr + bitnum);
755ca3e8d88SDave Plauger 		}
756ca3e8d88SDave Plauger 		bitnum -= pw->mppages;
757ca3e8d88SDave Plauger 		pw->basenum += pw->mppages;
75856f33205SJonathan Adams 		pw->mp = pw->mp->ml_next;
759ca3e8d88SDave Plauger 		if (pw->mp != NULL) {
76056f33205SJonathan Adams 			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
761ca3e8d88SDave Plauger 			pw->mpleft = pw->mppages;
76256f33205SJonathan Adams 			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
763ca3e8d88SDave Plauger 		}
764ca3e8d88SDave Plauger 	}
765ca3e8d88SDave Plauger 	return (PFN_INVALID);
766ca3e8d88SDave Plauger }
767ca3e8d88SDave Plauger 
768ca3e8d88SDave Plauger static pgcnt_t
dump_pfn_to_bitnum(pfn_t pfn)769ca3e8d88SDave Plauger dump_pfn_to_bitnum(pfn_t pfn)
770ca3e8d88SDave Plauger {
771ca3e8d88SDave Plauger 	struct memlist *mp;
772ca3e8d88SDave Plauger 	pgcnt_t bitnum = 0;
773ca3e8d88SDave Plauger 
77456f33205SJonathan Adams 	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
77556f33205SJonathan Adams 		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
77656f33205SJonathan Adams 		    pfn < ((mp->ml_address +