27c478bdstevel@tonic-gate * CDDL HEADER START
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * The contents of this file are subject to the terms of the
5ae115bcmrj * Common Development and Distribution License (the "License").
6ae115bcmrj * You may not use this file except in compliance with the License.
77c478bdstevel@tonic-gate *
87c478bdstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bdstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bdstevel@tonic-gate * See the License for the specific language governing permissions
117c478bdstevel@tonic-gate * and limitations under the License.
127c478bdstevel@tonic-gate *
137c478bdstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bdstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bdstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bdstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bdstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bdstevel@tonic-gate *
197c478bdstevel@tonic-gate * CDDL HEADER END
207c478bdstevel@tonic-gate */
21342440ePrasad Singamsetty
2386f2194Dave Plauger * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24dfec2ecJohn Levon * Copyright 2018 Joyent, Inc.
256ccea42Joyce McIntosh * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
267c478bdstevel@tonic-gate */
287c478bdstevel@tonic-gate#include <sys/types.h>
297c478bdstevel@tonic-gate#include <sys/param.h>
307c478bdstevel@tonic-gate#include <sys/systm.h>
317c478bdstevel@tonic-gate#include <sys/vm.h>
327c478bdstevel@tonic-gate#include <sys/proc.h>
337c478bdstevel@tonic-gate#include <sys/file.h>
347c478bdstevel@tonic-gate#include <sys/conf.h>
357c478bdstevel@tonic-gate#include <sys/kmem.h>
367c478bdstevel@tonic-gate#include <sys/mem.h>
377c478bdstevel@tonic-gate#include <sys/mman.h>
387c478bdstevel@tonic-gate#include <sys/vnode.h>
397c478bdstevel@tonic-gate#include <sys/errno.h>
407c478bdstevel@tonic-gate#include <sys/memlist.h>
417c478bdstevel@tonic-gate#include <sys/dumphdr.h>
427c478bdstevel@tonic-gate#include <sys/dumpadm.h>
437c478bdstevel@tonic-gate#include <sys/ksyms.h>
447c478bdstevel@tonic-gate#include <sys/compress.h>
457c478bdstevel@tonic-gate#include <sys/stream.h>
467c478bdstevel@tonic-gate#include <sys/strsun.h>
477c478bdstevel@tonic-gate#include <sys/cmn_err.h>
487c478bdstevel@tonic-gate#include <sys/bitmap.h>
497c478bdstevel@tonic-gate#include <sys/modctl.h>
507c478bdstevel@tonic-gate#include <sys/utsname.h>
517c478bdstevel@tonic-gate#include <sys/systeminfo.h>
527c478bdstevel@tonic-gate#include <sys/vmem.h>
537c478bdstevel@tonic-gate#include <sys/log.h>
547c478bdstevel@tonic-gate#include <sys/var.h>
557c478bdstevel@tonic-gate#include <sys/debug.h>
567c478bdstevel@tonic-gate#include <sys/sunddi.h>
577c478bdstevel@tonic-gate#include <fs/fs_subr.h>
587c478bdstevel@tonic-gate#include <sys/fs/snode.h>
597c478bdstevel@tonic-gate#include <sys/ontrap.h>
607c478bdstevel@tonic-gate#include <sys/panic.h>
617c478bdstevel@tonic-gate#include <sys/dkio.h>
627c478bdstevel@tonic-gate#include <sys/vtoc.h>
637c478bdstevel@tonic-gate#include <sys/errorq.h>
647c478bdstevel@tonic-gate#include <sys/fm/util.h>
65e7cbe64gw#include <sys/fs/zfs.h>
677c478bdstevel@tonic-gate#include <vm/hat.h>
687c478bdstevel@tonic-gate#include <vm/as.h>
697c478bdstevel@tonic-gate#include <vm/page.h>
70ca3e8d8Dave Plauger#include <vm/pvn.h>
717c478bdstevel@tonic-gate#include <vm/seg.h>
727c478bdstevel@tonic-gate#include <vm/seg_kmem.h>
73d3d5073Rafael Vanoni#include <sys/clock_impl.h>
74b280b2aStuart Maybee#include <sys/hold_page.h>
756ccea42Joyce McIntosh#include <sys/cpu.h>
77ca3e8d8Dave Plauger#include <bzip2/bzlib.h>
78ca3e8d8Dave Plauger
/* 2^30 bytes; the UL suffix makes the product an unsigned long. */
#define	ONE_GIG	(1024 * 1024 * 1024UL)
80dfec2ecJohn Levon
/*
 * Crash dump time is dominated by disk write time.  To reduce this,
 * the stronger compression method bzip2 is applied to reduce the dump
 * size and hence reduce I/O time.  However, bzip2 is much more
 * computationally expensive than the existing lzjb algorithm, so to
 * avoid increasing compression time, CPUs that are otherwise idle
 * during panic are employed to parallelize the compression task.
 * Many helper CPUs are needed to prevent bzip2 from being a
 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
 * parallelized instead. Lastly, I/O and compression are performed by
 * different CPUs, and are hence overlapped in time, unlike the older
 * serial code.
 *
 * Another important consideration is the speed of the dump
 * device. Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * elevated for faster disks. The dump device speed is inferred from
 * the setting for dumpbuf.iosize, see dump_update_clevel.
 */
101ca3e8d8Dave Plauger
102ca3e8d8Dave Plauger/*
103ca3e8d8Dave Plauger * exported vars
104ca3e8d8Dave Plauger */
105ca3e8d8Dave Plaugerkmutex_t	dump_lock;		/* lock for dump configuration */
106ca3e8d8Dave Plaugerdumphdr_t	*dumphdr;		/* dump header */
1077c478bdstevel@tonic-gateint		dump_conflags = DUMP_KERNEL; /* dump configuration flags */
108ca3e8d8Dave Plaugervnode_t		*dumpvp;		/* dump device vnode pointer */
109ca3e8d8Dave Plaugeru_offset_t	dumpvp_size;		/* size of dump device, in bytes */
110ca3e8d8Dave Plaugerchar		*dumppath;		/* pathname of dump device */
111ca3e8d8Dave Plaugerint		dump_timeout = 120;	/* timeout for dumping pages */
112ca3e8d8Dave Plaugerint		dump_timeleft;		/* portion of dump_timeout remaining */
113ca3e8d8Dave Plaugerint		dump_ioerr;		/* dump i/o error */
114ca3e8d8Dave Plaugerint		dump_check_used;	/* enable check for used pages */
115f6e214cGavin Maltbychar	    *dump_stack_scratch; /* scratch area for saving stack summary */
116ca3e8d8Dave Plauger
117ca3e8d8Dave Plauger/*
118ca3e8d8Dave Plauger * Tunables for dump compression and parallelism. These can be set via
119ca3e8d8Dave Plauger * /etc/system.
120ca3e8d8Dave Plauger *
121ca3e8d8Dave Plauger * dump_ncpu_low	number of helpers for parallel lzjb
122ca3e8d8Dave Plauger *	This is also the minimum configuration.
123ca3e8d8Dave Plauger *
124ca3e8d8Dave Plauger * dump_bzip2_level	bzip2 compression level: 1-9
125ca3e8d8Dave Plauger *	Higher numbers give greater compression, but take more memory
126ca3e8d8Dave Plauger *	and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
127ca3e8d8Dave Plauger *
128ca3e8d8Dave Plauger * dump_plat_mincpu	the cross-over limit for using bzip2 (per platform):
129ca3e8d8Dave Plauger *	if dump_plat_mincpu == 0, then always do single threaded dump
130ca3e8d8Dave Plauger *	if ncpu >= dump_plat_mincpu then try to use bzip2
131ca3e8d8Dave Plauger *
132ca3e8d8Dave Plauger * dump_metrics_on	if set, metrics are collected in the kernel, passed
133ca3e8d8Dave Plauger *	to savecore via the dump file, and recorded by savecore in
134ca3e8d8Dave Plauger *	METRICS.txt.
135ca3e8d8Dave Plauger */
136ca3e8d8Dave Plaugeruint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
137ca3e8d8Dave Plaugeruint_t dump_bzip2_level = 1;	/* bzip2 level (1-9) */
138ca3e8d8Dave Plauger
1394cca9c8Dave Plauger/* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
1404cca9c8Dave Plauger#define	MINCPU_NOT_SET	((uint_t)-1)
1414cca9c8Dave Plaugeruint_t dump_plat_mincpu = MINCPU_NOT_SET;
1424cca9c8Dave Plauger
1439dd77bcDave Plauger/* tunables for pre-reserved heap */
1449dd77bcDave Plaugeruint_t dump_kmem_permap = 1024;
145dfec2ecJohn Levonuint_t dump_kmem_pages = 0;
1469dd77bcDave Plauger
/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/*
 * minimum number of helpers configured
 *
 * Both macros read the dump_ncpu_low tunable each time they are
 * expanded, so they are not compile-time constants.
 */
#define	MINHELPERS	(dump_ncpu_low)
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)
154ca3e8d8Dave Plauger
/*
 * Define constant parameters.
 *
 * DUMP_1KB, DUMP_1MB	byte counts for 1KB (2^10) and 1MB (2^20)
 *
 * CBUF_SIZE		size of an output buffer (2^17 bytes)
 *
 * CBUF_MAPSHIFT	log2 of CBUF_MAPSIZE
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
171ca3e8d8Dave Plauger
172ca3e8d8Dave Plauger/*
173ca3e8d8Dave Plauger * Compression metrics are accumulated nano-second subtotals. The
174ca3e8d8Dave Plauger * results are normalized by the number of pages dumped. A report is
175ca3e8d8Dave Plauger * generated when dumpsys() completes and is saved in the dump image
176ca3e8d8Dave Plauger * after the trailing dump header.
177ca3e8d8Dave Plauger *
178ca3e8d8Dave Plauger * Metrics are always collected. Set the variable dump_metrics_on to
179ca3e8d8Dave Plauger * cause metrics to be saved in the crash file, where savecore will
180ca3e8d8Dave Plauger * save it in the file METRICS.txt.
181ca3e8d8Dave Plauger */
182ca3e8d8Dave Plauger#define	PERPAGES \
183ca3e8d8Dave Plauger	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
184ca3e8d8Dave Plauger	PERPAGE(copy) PERPAGE(compress) \
185ca3e8d8Dave Plauger	PERPAGE(write) \
186ca3e8d8Dave Plauger	PERPAGE(inwait) PERPAGE(outwait)
187ca3e8d8Dave Plauger
188ca3e8d8Dave Plaugertypedef struct perpage {
189ca3e8d8Dave Plauger#define	PERPAGE(x) hrtime_t x;
190ca3e8d8Dave Plauger	PERPAGES
191ca3e8d8Dave Plauger#undef PERPAGE
192ca3e8d8Dave Plauger} perpage_t;
193ca3e8d8Dave Plauger
194ca3e8d8Dave Plauger/*
195ca3e8d8Dave Plauger * This macro controls the code generation for collecting dump
196ca3e8d8Dave Plauger * performance information. By default, the code is generated, but
197ca3e8d8Dave Plauger * automatic saving of the information is disabled. If dump_metrics_on
198ca3e8d8Dave Plauger * is set to 1, the timing information is passed to savecore via the
199ca3e8d8Dave Plauger * crash file, where it is appended to the file dump-dir/METRICS.txt.
200ca3e8d8Dave Plauger */
201ca3e8d8Dave Plauger#define	COLLECT_METRICS
202ca3e8d8Dave Plauger
203ca3e8d8Dave Plauger#ifdef COLLECT_METRICS
204ca3e8d8Dave Plaugeruint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */
205ca3e8d8Dave Plauger
206ca3e8d8Dave Plauger#define	HRSTART(v, m)		v##ts.m = gethrtime()
207ca3e8d8Dave Plauger#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
208ca3e8d8Dave Plauger#define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
209ca3e8d8Dave Plauger#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
210ca3e8d8Dave Plauger#define	HRNORM(v, m, n)		v.m /= (n)
213ca3e8d8Dave Plauger#define	HRSTART(v, m)
214ca3e8d8Dave Plauger#define	HRSTOP(v, m)
215ca3e8d8Dave Plauger#define	HRBEGIN(v, m, s)
216ca3e8d8Dave Plauger#define	HREND(v, m)
217ca3e8d8Dave Plauger#define	HRNORM(v, m, n)
218ca3e8d8Dave Plauger#endif	/* COLLECT_METRICS */
219ca3e8d8Dave Plauger
/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 *
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors.)
 */

typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;
252ca3e8d8Dave Plauger
253ca3e8d8Dave Plaugertypedef struct cbuf cbuf_t;
254ca3e8d8Dave Plauger
255ca3e8d8Dave Plaugerstruct cbuf {
256ca3e8d8Dave Plauger	cbuf_t *next;			/* next in list */
257ca3e8d8Dave Plauger	cbufstate_t state;		/* processing state */
258ca3e8d8Dave Plauger	size_t used;			/* amount used */
259ca3e8d8Dave Plauger	size_t size;			/* mem size */
260ca3e8d8Dave Plauger	char *buf;			/* kmem or vmem */
261ca3e8d8Dave Plauger	pgcnt_t pagenum;		/* index to pfn map */
262ca3e8d8Dave Plauger	pgcnt_t bitnum;			/* first set bitnum */
263ca3e8d8Dave Plauger	pfn_t pfn;			/* first pfn in mapped range */
264ca3e8d8Dave Plauger	int off;			/* byte offset to first pfn */
265ca3e8d8Dave Plauger};
266ca3e8d8Dave Plauger
/* 36 characters plus a terminating NUL. */
static char dump_osimage_uuid[36 + 1];

/*
 * Local ASCII classification macros. Both evaluate their argument
 * more than once, so do not pass an expression with side effects.
 */
#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define	isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
			((ch) >= 'A' && (ch) <= 'F'))
272f6e214cGavin Maltby
273ca3e8d8Dave Plauger/*
274ca3e8d8Dave Plauger * cqueue_t queues: a uni-directional channel for communication
275ca3e8d8Dave Plauger * from the master to helper tasks or vice-versa using put and
276ca3e8d8Dave Plauger * get primitives. Both mappings and data buffers are passed via
277ca3e8d8Dave Plauger * queues. Producers close a queue when done. The number of
278ca3e8d8Dave Plauger * active producers is reference counted so the consumer can
279ca3e8d8Dave Plauger * detect end of data. Concurrent access is mediated by atomic
280ca3e8d8Dave Plauger * operations for panic dump, or mutex/cv for live dump.
281ca3e8d8Dave Plauger *
282ca3e8d8Dave Plauger * There a four queues, used as follows:
283ca3e8d8Dave Plauger *
284ca3e8d8Dave Plauger * Queue		Dataflow		NewState
285ca3e8d8Dave Plauger * --------------------------------------------------
286ca3e8d8Dave Plauger * mainq		master -> master	FREEMAP
287ca3e8d8Dave Plauger * master has initialized or unmapped an input buffer
288ca3e8d8Dave Plauger * --------------------------------------------------
289ca3e8d8Dave Plauger * helperq		master -> helper	INREADY
290ca3e8d8Dave Plauger * master has mapped input for use by helper
291ca3e8d8Dave Plauger * --------------------------------------------------
292ca3e8d8Dave Plauger * mainq		master <- helper	USEDMAP
293ca3e8d8Dave Plauger * helper is done with input
294ca3e8d8Dave Plauger * --------------------------------------------------
295ca3e8d8Dave Plauger * freebufq		master -> helper	FREEBUF
296ca3e8d8Dave Plauger * master has initialized or written an output buffer
297ca3e8d8Dave Plauger * --------------------------------------------------
298ca3e8d8Dave Plauger * mainq		master <- helper	WRITE
299ca3e8d8Dave Plauger * block of compressed pages from a helper
300ca3e8d8Dave Plauger * --------------------------------------------------
301ca3e8d8Dave Plauger * mainq		master <- helper	ERRMSG
302ca3e8d8Dave Plauger * error messages from a helper (memory error case)
303ca3e8d8Dave Plauger * --------------------------------------------------
304ca3e8d8Dave Plauger * writerq		master <- master	WRITE
305ca3e8d8Dave Plauger * non-blocking queue of blocks to write
306ca3e8d8Dave Plauger * --------------------------------------------------
307ca3e8d8Dave Plauger */
308ca3e8d8Dave Plaugertypedef struct cqueue {
309ca3e8d8Dave Plauger	cbuf_t *volatile first;		/* first in list */
310ca3e8d8Dave Plauger	cbuf_t *last;			/* last in list */
311ca3e8d8Dave Plauger	hrtime_t ts;			/* timestamp */
312ca3e8d8Dave Plauger	hrtime_t empty;			/* total time empty */
313ca3e8d8Dave Plauger	kmutex_t mutex;			/* live state lock */
314ca3e8d8Dave Plauger	kcondvar_t cv;			/* live wait var */
315ca3e8d8Dave Plauger	lock_t spinlock;		/* panic mode spin lock */
316ca3e8d8Dave Plauger	volatile uint_t open;		/* producer ref count */
317ca3e8d8Dave Plauger} cqueue_t;
318ca3e8d8Dave Plauger
/*
 * Convenience macros for using the cqueue functions
 * Note that the caller must have defined "dumpsync_t *ds"
 *
 * In every macro, "q" is the name of a queue member of *ds (not a
 * pointer); the macro forms ds->q itself.
 */
#define	CQ_IS_EMPTY(q)					\
	(ds->q.first == NULL)

#define	CQ_OPEN(q)					\
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q)					\
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st)				\
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q)					\
	dumpsys_get_cq(&ds->q, ds->live)
337ca3e8d8Dave Plauger
338ca3e8d8Dave Plauger/*
339ca3e8d8Dave Plauger * Dynamic state when dumpsys() is running.
340ca3e8d8Dave Plauger */
341ca3e8d8Dave Plaugertypedef struct dumpsync {
342ca3e8d8Dave Plauger	pgcnt_t npages;			/* subtotal of pages dumped */
343ca3e8d8Dave Plauger	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
344ca3e8d8Dave Plauger	pgcnt_t pages_used;		/* subtotal of pages used per map */
345ca3e8d8Dave Plauger	size_t nwrite;			/* subtotal of bytes written */
346ca3e8d8Dave Plauger	uint_t live;			/* running live dump */
347ca3e8d8Dave Plauger	uint_t neednl;			/* will need to print a newline */
348ca3e8d8Dave Plauger	uint_t percent;			/* dump progress */
349ca3e8d8Dave Plauger	uint_t percent_done;		/* dump progress reported */
3502fb0949Joshua M. Clulow	int sec_done;			/* dump progress last report time */
351ca3e8d8Dave Plauger	cqueue_t freebufq;		/* free kmem bufs for writing */
352ca3e8d8Dave Plauger	cqueue_t mainq;			/* input for main task */
353ca3e8d8Dave Plauger	cqueue_t helperq;		/* input for helpers */
354ca3e8d8Dave Plauger	cqueue_t writerq;		/* input for writer */
355ca3e8d8Dave Plauger	hrtime_t start;			/* start time */
356ca3e8d8Dave Plauger	hrtime_t elapsed;		/* elapsed time when completed */
357ca3e8d8Dave Plauger	hrtime_t iotime;		/* time spent writing nwrite bytes */
358ca3e8d8Dave Plauger	hrtime_t iowait;		/* time spent waiting for output */
359ca3e8d8Dave Plauger	hrtime_t iowaitts;		/* iowait timestamp */
360ca3e8d8Dave Plauger	perpage_t perpage;		/* metrics */
361ca3e8d8Dave Plauger	perpage_t perpagets;
362ca3e8d8Dave Plauger	int dumpcpu;			/* master cpu */
363ca3e8d8Dave Plauger} dumpsync_t;
364ca3e8d8Dave Plauger
365ca3e8d8Dave Plaugerstatic dumpsync_t dumpsync;		/* synchronization vars */
366ca3e8d8Dave Plauger
367ca3e8d8Dave Plauger/*
368ca3e8d8Dave Plauger * helper_t helpers: contains the context for a stream. CPUs run in
369ca3e8d8Dave Plauger * parallel at dump time; each CPU creates a single stream of
370ca3e8d8Dave Plauger * compression data.  Stream data is divided into CBUF_SIZE blocks.
371ca3e8d8Dave Plauger * The blocks are written in order within a stream. But, blocks from
372ca3e8d8Dave Plauger * multiple streams can be interleaved. Each stream is identified by a
373ca3e8d8Dave Plauger * unique tag.
374ca3e8d8Dave Plauger */
375ca3e8d8Dave Plaugertypedef struct helper {
376ca3e8d8Dave Plauger	int helper;			/* bound helper id */
377ca3e8d8Dave Plauger	int tag;			/* compression stream tag */
378ca3e8d8Dave Plauger	perpage_t perpage;		/* per page metrics */
379ca3e8d8Dave Plauger	perpage_t perpagets;		/* per page metrics (timestamps) */
380ca3e8d8Dave Plauger	taskqid_t taskqid;		/* live dump task ptr */
381ca3e8d8Dave Plauger	int in, out;			/* buffer offsets */
382ca3e8d8Dave Plauger	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
383ca3e8d8Dave Plauger	dumpsync_t *ds;			/* pointer to sync vars */
384ca3e8d8Dave Plauger	size_t used;			/* counts input consumed */
385ca3e8d8Dave Plauger	char *page;			/* buffer for page copy */
386ca3e8d8Dave Plauger	char *lzbuf;			/* lzjb output */
387ca3e8d8Dave Plauger	bz_stream bzstream;		/* bzip2 state */
388ca3e8d8Dave Plauger} helper_t;
389ca3e8d8Dave Plauger
/* Special (negative) values for a helper id. */
#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */
394ca3e8d8Dave Plauger/*
395ca3e8d8Dave Plauger * configuration vars for dumpsys
396ca3e8d8Dave Plauger */
397ca3e8d8Dave Plaugertypedef struct dumpcfg {
398ca3e8d8Dave Plauger	int	threshold;	/* ncpu threshold for bzip2 */
399ca3e8d8Dave Plauger	int	nhelper;	/* number of helpers */
400ca3e8d8Dave Plauger	int	nhelper_used;	/* actual number of helpers used */
401ca3e8d8Dave Plauger	int	ncmap;		/* number VA pages for compression */
402ca3e8d8Dave Plauger	int	ncbuf;		/* number of bufs for compression */
403ca3e8d8Dave Plauger	int	ncbuf_used;	/* number of bufs in use */
404ca3e8d8Dave Plauger	uint_t	clevel;		/* dump compression level */
405ca3e8d8Dave Plauger	helper_t *helper;	/* array of helpers */
406ca3e8d8Dave Plauger	cbuf_t	*cmap;		/* array of input (map) buffers */
407ca3e8d8Dave Plauger	cbuf_t	*cbuf;		/* array of output  buffers */
408ca3e8d8Dave Plauger	ulong_t	*helpermap;	/* set of dumpsys helper CPU ids */
409ca3e8d8Dave Plauger	ulong_t	*bitmap;	/* bitmap for marking pages to dump */
410ca3e8d8Dave Plauger	ulong_t	*rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
411ca3e8d8Dave Plauger	pgcnt_t	bitmapsize;	/* size of bitmap */
412ca3e8d8Dave Plauger	pgcnt_t	rbitmapsize;	/* size of bitmap for ranges */
413ca3e8d8Dave Plauger	pgcnt_t found4m;	/* number ranges allocated by dump */
414ca3e8d8Dave Plauger	pgcnt_t foundsm;	/* number small pages allocated by dump */
415ca3e8d8Dave Plauger	pid_t	*pids;		/* list of process IDs at dump time */
416ca3e8d8Dave Plauger	size_t	maxsize;	/* memory size needed at dump time */
417ca3e8d8Dave Plauger	size_t	maxvmsize;	/* size of reserved VM */
418ca3e8d8Dave Plauger	char	*maxvm;		/* reserved VM for spare pages */
419ca3e8d8Dave Plauger	lock_t	helper_lock;	/* protect helper state */
420ca3e8d8Dave Plauger	char	helpers_wanted;	/* flag to enable parallelism */
421ca3e8d8Dave Plauger} dumpcfg_t;
422ca3e8d8Dave Plauger
423ca3e8d8Dave Plaugerstatic dumpcfg_t dumpcfg;	/* config vars */
424ca3e8d8Dave Plauger
425ca3e8d8Dave Plauger/*
426ca3e8d8Dave Plauger * The dump I/O buffer.
427ca3e8d8Dave Plauger *
428ca3e8d8Dave Plauger * There is one I/O buffer used by dumpvp_write and dumvp_flush. It is
429ca3e8d8Dave Plauger * sized according to the optimum device transfer speed.
430ca3e8d8Dave Plauger */
431ca3e8d8Dave Plaugertypedef struct dumpbuf {
432ca3e8d8Dave Plauger	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
433ca3e8d8Dave Plauger	len_t	vp_limit;	/* maximum write offset */
434ca3e8d8Dave Plauger	offset_t vp_off;	/* current dump device offset */
435ca3e8d8Dave Plauger	char	*cur;		/* dump write pointer */
436ca3e8d8Dave Plauger	char	*start;		/* dump buffer address */
437ca3e8d8Dave Plauger	char	*end;		/* dump buffer end */
438ca3e8d8Dave Plauger	size_t	size;		/* size of dumpbuf in bytes */
439ca3e8d8Dave Plauger	size_t	iosize;		/* best transfer size for device */
440ca3e8d8Dave Plauger} dumpbuf_t;
441ca3e8d8Dave Plauger
442ca3e8d8Dave Plaugerdumpbuf_t dumpbuf;		/* I/O buffer */
443ca3e8d8Dave Plauger
/*
 * For parallel dump, defines maximum time main task thread will wait
 * for at least one helper to register in dumpcfg.helpermap, before
 * assuming there are no helpers and falling back to serial mode.
 * The value is chosen arbitrarily and provides a *really* long wait
 * for any available helper to register.
 */
#define	DUMP_HELPER_MAX_WAIT	1000	/* millisec */
4526ccea42Joyce McIntosh
4536ccea42Joyce McIntosh/*
454ca3e8d8Dave Plauger * The dump I/O buffer must be at least one page, at most xfer_size
455ca3e8d8Dave Plauger * bytes, and should scale with physmem in between.  The transfer size
456ca3e8d8Dave Plauger * passed in will either represent a global default (maxphys) or the
457ca3e8d8Dave Plauger * best size for the device.  The size of the dumpbuf I/O buffer is
458ca3e8d8Dave Plauger * limited by dumpbuf_limit (8MB by default) because the dump
459ca3e8d8Dave Plauger * performance saturates beyond a certain size.  The default is to
460ca3e8d8Dave Plauger * select 1/4096 of the memory.
4617c478bdstevel@tonic-gate */
462ca3e8d8Dave Plaugerstatic int	dumpbuf_fraction = 12;	/* memory size scale factor */
463ca3e8d8Dave Plaugerstatic size_t	dumpbuf_limit = 8 * DUMP_1MB;	/* max I/O buf size */
464ca3e8d8Dave Plauger
4657c478bdstevel@tonic-gatestatic size_t
4667c478bdstevel@tonic-gatedumpbuf_iosize(size_t xfer_size)
468ca3e8d8Dave Plauger	size_t iosize = ptob(physmem >> dumpbuf_fraction);
469ca3e8d8Dave Plauger
470ca3e8d8Dave Plauger	if (iosize < PAGESIZE)
471ca3e8d8Dave Plauger		iosize = PAGESIZE;
472ca3e8d8Dave Plauger	else if (iosize > xfer_size)
473ca3e8d8Dave Plauger		iosize = xfer_size;
474ca3e8d8Dave Plauger	if (iosize > dumpbuf_limit)
475ca3e8d8Dave Plauger		iosize = dumpbuf_limit;
476ca3e8d8Dave Plauger	return (iosize & PAGEMASK);
479ca3e8d8Dave Plauger/*
480ca3e8d8Dave Plauger * resize the I/O buffer
481ca3e8d8Dave Plauger */
4827c478bdstevel@tonic-gatestatic void
485ca3e8d8Dave Plauger	char *old_buf = dumpbuf.start;
486ca3e8d8Dave Plauger	size_t old_size = dumpbuf.size;
4877c478bdstevel@tonic-gate	char *new_buf;
4887c478bdstevel@tonic-gate	size_t new_size;
4907c478bdstevel@tonic-gate	ASSERT(MUTEX_HELD(&dump_lock));
492ca3e8d8Dave Plauger	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
493ca3e8d8Dave Plauger	if (new_size <= old_size)
4947c478bdstevel@tonic-gate		return; /* no need to reallocate buffer */
4967c478bdstevel@tonic-gate	new_buf = kmem_alloc(new_size, KM_SLEEP);
497ca3e8d8Dave Plauger	dumpbuf.size = new_size;
498ca3e8d8Dave Plauger	dumpbuf.start = new_buf;
499ca3e8d8Dave Plauger	dumpbuf.end = new_buf + new_size;
5007c478bdstevel@tonic-gate	kmem_free(old_buf, old_size);
503ca3e8d8Dave Plauger/*
504ca3e8d8Dave Plauger * dump_update_clevel is called when dumpadm configures the dump device.
5056ccea42Joyce McIntosh *	Calculate number of helpers and buffers.
5066ccea42Joyce McIntosh *	Allocate the minimum configuration for now.
507ca3e8d8Dave Plauger *
508ca3e8d8Dave Plauger * When the dump file is configured we reserve a minimum amount of
509ca3e8d8Dave Plauger * memory for use at crash time. But we reserve VA for all the memory
510ca3e8d8Dave Plauger * we really want in order to do the fastest dump possible. The VA is
511ca3e8d8Dave Plauger * backed by pages not being dumped, according to the bitmap. If
512ca3e8d8Dave Plauger * there is insufficient spare memory, however, we fall back to the
513ca3e8d8Dave Plauger * minimum.
514ca3e8d8Dave Plauger *
515ca3e8d8Dave Plauger * Live dump (savecore -L) always uses the minimum config.
516ca3e8d8Dave Plauger *
517ca3e8d8Dave Plauger * clevel 0 is single threaded lzjb
518ca3e8d8Dave Plauger * clevel 1 is parallel lzjb
519ca3e8d8Dave Plauger * clevel 2 is parallel bzip2
520ca3e8d8Dave Plauger *
521ca3e8d8Dave Plauger * The ncpu threshold is selected with dump_plat_mincpu.
522ca3e8d8Dave Plauger * On OPL, set_platform_defaults() overrides the sun4u setting.
523ca3e8d8Dave Plauger * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
524ca3e8d8Dave Plauger *
525ca3e8d8Dave Plauger * Architecture		Threshold	Algorithm
5266ccea42Joyce McIntosh * sun4u		<  51		parallel lzjb
5276ccea42Joyce McIntosh * sun4u		>= 51		parallel bzip2(*)
5286ccea42Joyce McIntosh * sun4u OPL		<  8		parallel lzjb
5296ccea42Joyce McIntosh * sun4u OPL		>= 8		parallel bzip2(*)
5306ccea42Joyce McIntosh * sun4v		<  128		parallel lzjb
5316ccea42Joyce McIntosh * sun4v		>= 128		parallel bzip2(*)
532ca3e8d8Dave Plauger * x86			< 11		parallel lzjb
533ca3e8d8Dave Plauger * x86			>= 11		parallel bzip2(*)
5346ccea42Joyce McIntosh * 32-bit		N/A		single-threaded lzjb
535ca3e8d8Dave Plauger *
536ca3e8d8Dave Plauger * (*) bzip2 is only chosen if there is sufficient available
537ca3e8d8Dave Plauger * memory for buffers at dump time. See dumpsys_get_maxmem().
538ca3e8d8Dave Plauger *
539ca3e8d8Dave Plauger * Faster dump devices have larger I/O buffers. The threshold value is
540ca3e8d8Dave Plauger * increased according to the size of the dump I/O buffer, because
541ca3e8d8Dave Plauger * parallel lzjb performs better with faster disks. For buffers >= 1MB
542ca3e8d8Dave Plauger * the threshold is 3X; for buffers >= 256K threshold is 2X.
543ca3e8d8Dave Plauger *
544ca3e8d8Dave Plauger * For parallel dumps, the number of helpers is ncpu-1. The CPU
545ca3e8d8Dave Plauger * running panic runs the main task. For single-threaded dumps, the
546ca3e8d8Dave Plauger * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
547ca3e8d8Dave Plauger *
548ca3e8d8Dave Plauger * Need multiple buffers per helper so that they do not block waiting
549ca3e8d8Dave Plauger * for the main task.
550ca3e8d8Dave Plauger *				parallel	single-threaded
551ca3e8d8Dave Plauger * Number of output buffers:	nhelper*2		1
552ca3e8d8Dave Plauger * Number of mapping buffers:	nhelper*4		1
553ca3e8d8Dave Plauger *
554ca3e8d8Dave Plauger */
555ca3e8d8Dave Plaugerstatic void
556ca3e8d8Dave Plaugerdump_update_clevel()
557ca3e8d8Dave Plauger{
558ca3e8d8Dave Plauger	int tag;
559ca3e8d8Dave Plauger	size_t bz2size;
560ca3e8d8Dave Plauger	helper_t *hp, *hpend;
561ca3e8d8Dave Plauger	cbuf_t *cp, *cpend;
562ca3e8d8Dave Plauger	dumpcfg_t *old = &dumpcfg;
563ca3e8d8Dave Plauger	dumpcfg_t newcfg = *old;
564ca3e8d8Dave Plauger	dumpcfg_t *new = &newcfg;
565ca3e8d8Dave Plauger
566ca3e8d8Dave Plauger	ASSERT(MUTEX_HELD(&dump_lock));
567ca3e8d8Dave Plauger
568ca3e8d8Dave Plauger	/*
569ca3e8d8Dave Plauger	 * Free the previously allocated bufs and VM.
570ca3e8d8Dave Plauger	 */
571ca3e8d8Dave Plauger	if (old->helper != NULL) {
572ca3e8d8Dave Plauger
573ca3e8d8Dave Plauger		/* helpers */
574ca3e8d8Dave Plauger		hpend = &old->helper[old->nhelper];
575ca3e8d8Dave Plauger		for (hp = old->helper; hp != hpend; hp++) {
576ca3e8d8Dave Plauger			if (hp->lzbuf != NULL)
577ca3e8d8Dave Plauger				kmem_free(hp->lzbuf, PAGESIZE);
578ca3e8d8Dave Plauger			if (hp->page != NULL)
579ca3e8d8Dave Plauger				kmem_free(hp->page, PAGESIZE);
580ca3e8d8Dave Plauger		}
581ca3e8d8Dave Plauger		kmem_free(old->helper, old->nhelper * sizeof (helper_t));
582ca3e8d8Dave Plauger
583ca3e8d8Dave Plauger		/* VM space for mapping pages */
584ca3e8d8Dave Plauger		cpend = &old->cmap[old->ncmap];
585ca3e8d8Dave Plauger		for (cp = old->cmap; cp != cpend; cp++)
586ca3e8d8Dave Plauger			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
587ca3e8d8Dave Plauger		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));
588ca3e8d8Dave Plauger
589ca3e8d8Dave Plauger		/* output bufs */
590ca3e8d8Dave Plauger		cpend = &old->cbuf[old->ncbuf];
591ca3e8d8Dave Plauger		for (cp = old->cbuf; cp != cpend; cp++)
592ca3e8d8Dave Plauger			if (cp->buf != NULL)
593ca3e8d8Dave Plauger				kmem_free(cp->buf, cp->size);
594ca3e8d8Dave Plauger		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));
595ca3e8d8Dave Plauger
596ca3e8d8Dave Plauger		/* reserved VM for dumpsys_get_maxmem */
597ca3e8d8Dave Plauger		if (old->maxvmsize > 0)
598ca3e8d8Dave Plauger			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
599ca3e8d8Dave Plauger	}
600ca3e8d8Dave Plauger
601ca3e8d8Dave Plauger	/*
602ca3e8d8Dave Plauger	 * Allocate memory and VM.
603ca3e8d8Dave Plauger	 * One CPU runs dumpsys, the rest are helpers.
604ca3e8d8Dave Plauger	 */
605ca3e8d8Dave Plauger	new->nhelper = ncpus - 1;
606ca3e8d8Dave Plauger	if (new->nhelper < 1)
607ca3e8d8Dave Plauger		new->nhelper = 1;
608ca3e8d8Dave Plauger
609ca3e8d8Dave Plauger	if (new->nhelper > DUMP_MAX_NHELPER)
610ca3e8d8Dave Plauger		new->nhelper = DUMP_MAX_NHELPER;
611ca3e8d8Dave Plauger
6124cca9c8Dave Plauger	/* use platform default, unless /etc/system overrides */
6134cca9c8Dave Plauger	if (dump_plat_mincpu == MINCPU_NOT_SET)
6144cca9c8Dave Plauger		dump_plat_mincpu = dump_plat_mincpu_default;
6154cca9c8Dave Plauger
616ca3e8d8Dave Plauger	/* increase threshold for faster disks */
617ca3e8d8Dave Plauger	new->threshold = dump_plat_mincpu;
618ca3e8d8Dave Plauger	if (dumpbuf.iosize >= DUMP_1MB)
619ca3e8d8Dave Plauger		new->threshold *= 3;
620ca3e8d8Dave Plauger	else if (dumpbuf.iosize >= (256 * DUMP_1KB))
621ca3e8d8Dave Plauger		new->threshold *= 2;
622ca3e8d8Dave Plauger
623ca3e8d8Dave Plauger	/* figure compression level based upon the computed threshold. */
624ca3e8d8Dave Plauger	if (dump_plat_mincpu == 0 || new->nhelper < 2) {
625ca3e8d8Dave Plauger		new->clevel = 0;
626ca3e8d8Dave Plauger		new->nhelper = 1;
627ca3e8d8Dave Plauger	} else if ((new->nhelper + 1) >= new->threshold) {
628ca3e8d8Dave Plauger		new->clevel = DUMP_CLEVEL_BZIP2;
629ca3e8d8Dave Plauger	} else {
630ca3e8d8Dave Plauger		new->clevel = DUMP_CLEVEL_LZJB;
631ca3e8d8Dave Plauger	}
632ca3e8d8Dave Plauger
633ca3e8d8Dave Plauger	if (new->clevel == 0) {
634ca3e8d8Dave Plauger		new->ncbuf = 1;
635ca3e8d8Dave Plauger		new->ncmap = 1;
636ca3e8d8Dave Plauger	} else {
637ca3e8d8Dave Plauger		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
638ca3e8d8Dave Plauger		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
639ca3e8d8Dave Plauger	}
640ca3e8d8Dave Plauger
641ca3e8d8Dave Plauger	/*
642ca3e8d8Dave Plauger	 * Allocate new data structures and buffers for MINHELPERS,
643ca3e8d8Dave Plauger	 * and also figure the max desired size.
644ca3e8d8Dave Plauger	 */
645ca3e8d8Dave Plauger	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
646ca3e8d8Dave Plauger	new->maxsize = 0;
647ca3e8d8Dave Plauger	new->maxvmsize = 0;
648ca3e8d8Dave Plauger	new->maxvm = NULL;
649ca3e8d8Dave Plauger	tag = 1;
650ca3e8d8Dave Plauger	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
651ca3e8d8Dave Plauger	hpend = &new->helper[new->nhelper];
652ca3e8d8Dave Plauger	for (hp = new->helper; hp != hpend; hp++) {
653ca3e8d8Dave Plauger		hp->tag = tag++;
654ca3e8d8Dave Plauger		if (hp < &new->helper[MINHELPERS]) {
655ca3e8d8Dave Plauger			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
656ca3e8d8Dave Plauger			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
657ca3e8d8Dave Plauger		} else if (new->clevel < DUMP_CLEVEL_BZIP2) {
658ca3e8d8Dave Plauger			new->maxsize += 2 * PAGESIZE;
659ca3e8d8Dave Plauger		} else {
660ca3e8d8Dave Plauger			new->maxsize += PAGESIZE;
661ca3e8d8Dave Plauger		}
662ca3e8d8Dave Plauger		if (new->clevel >= DUMP_CLEVEL_BZIP2)
663ca3e8d8Dave Plauger			new->maxsize += bz2size;
664ca3e8d8Dave Plauger	}
665ca3e8d8Dave Plauger
666ca3e8d8Dave Plauger	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
667ca3e8d8Dave Plauger	cpend = &new->cbuf[new->ncbuf];
668ca3e8d8Dave Plauger	for (cp = new->cbuf; cp != cpend; cp++) {
669ca3e8d8Dave Plauger		cp->state = CBUF_FREEBUF;
670ca3e8d8Dave Plauger		cp->size = CBUF_SIZE;
671ca3e8d8Dave Plauger		if (cp < &new->cbuf[MINCBUFS])
672ca3e8d8Dave Plauger			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
673ca3e8d8Dave Plauger		else
674ca3e8d8Dave Plauger			new->maxsize += cp->size;
675ca3e8d8Dave Plauger	}
676ca3e8d8Dave Plauger
677ca3e8d8Dave Plauger	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
678ca3e8d8Dave Plauger	cpend = &new->cmap[new->ncmap];
679ca3e8d8Dave Plauger	for (cp = new->cmap; cp != cpend; cp++) {
680ca3e8d8Dave Plauger		cp->state = CBUF_FREEMAP;
681ca3e8d8Dave Plauger		cp->size = CBUF_MAPSIZE;
682ca3e8d8Dave Plauger		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
683ca3e8d8Dave Plauger		    0, 0, NULL, NULL, VM_SLEEP);
684ca3e8d8Dave Plauger	}
685ca3e8d8Dave Plauger
686ca3e8d8Dave Plauger	/* reserve VA to be backed with spare pages at crash time */
687ca3e8d8Dave Plauger	if (new->maxsize > 0) {
688ca3e8d8Dave Plauger		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
689ca3e8d8Dave Plauger		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
690ca3e8d8Dave Plauger		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
691ca3e8d8Dave Plauger		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
692ca3e8d8Dave Plauger	}
693ca3e8d8Dave Plauger
6949dd77bcDave Plauger	/*
695dfec2ecJohn Levon	 * Reserve memory for kmem allocation calls made during crash dump.  The
696dfec2ecJohn Levon	 * hat layer allocates memory for each mapping created, and the I/O path
697dfec2ecJohn Levon	 * allocates buffers and data structs.
698dfec2ecJohn Levon	 *
699dfec2ecJohn Levon	 * On larger systems, we easily exceed the lower amount, so we need some
700dfec2ecJohn Levon	 * more space; the cut-over point is relatively arbitrary.  If we run
701dfec2ecJohn Levon	 * out, the only impact is that kmem state in the dump becomes
702dfec2ecJohn Levon	 * inconsistent.
7039dd77bcDave Plauger	 */
704dfec2ecJohn Levon
705dfec2ecJohn Levon	if (dump_kmem_pages == 0) {
706dfec2ecJohn Levon		if (physmem > (16 * ONE_GIG) / PAGESIZE)
707dfec2ecJohn Levon			dump_kmem_pages = 20;
708dfec2ecJohn Levon		else
709dfec2ecJohn Levon			dump_kmem_pages = 8;
710dfec2ecJohn Levon	}
711dfec2ecJohn Levon
7129dd77bcDave Plauger	kmem_dump_init((new->ncmap * dump_kmem_permap) +
7139dd77bcDave Plauger	    (dump_kmem_pages * PAGESIZE));
7149dd77bcDave Plauger
715ca3e8d8Dave Plauger	/* set new config pointers */
716ca3e8d8Dave Plauger	*old = *new;
717ca3e8d8Dave Plauger}
718ca3e8d8Dave Plauger
719ca3e8d8Dave Plauger/*
720ca3e8d8Dave Plauger * Define a struct memlist walker to optimize bitnum to pfn
721ca3e8d8Dave Plauger * lookup. The walker maintains the state of the list traversal.
722ca3e8d8Dave Plauger */
723ca3e8d8Dave Plaugertypedef struct dumpmlw {
724ca3e8d8Dave Plauger	struct memlist	*mp;		/* current memlist */
725ca3e8d8Dave Plauger	pgcnt_t		basenum;	/* bitnum base offset */
726ca3e8d8Dave Plauger	pgcnt_t		mppages;	/* current memlist size */
727ca3e8d8Dave Plauger	pgcnt_t		mpleft;		/* size to end of current memlist */
728ca3e8d8Dave Plauger	pfn_t		mpaddr;		/* first pfn in memlist */
729ca3e8d8Dave Plauger} dumpmlw_t;
730ca3e8d8Dave Plauger
731ca3e8d8Dave Plauger/* initialize the walker */
732ca3e8d8Dave Plaugerstatic inline void
733ca3e8d8Dave Plaugerdump_init_memlist_walker(dumpmlw_t *pw)
734ca3e8d8Dave Plauger{
735ca3e8d8Dave Plauger	pw->mp = phys_install;
736ca3e8d8Dave Plauger	pw->basenum = 0;
73756f3320Jonathan Adams	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
738ca3e8d8Dave Plauger	pw->mpleft = pw->mppages;
73956f3320Jonathan Adams	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
740ca3e8d8Dave Plauger}
741ca3e8d8Dave Plauger
742ca3e8d8Dave Plauger/*
743ca3e8d8Dave Plauger * Lookup pfn given bitnum. The memlist can be quite long on some
744ca3e8d8Dave Plauger * systems (e.g.: one per board). To optimize sequential lookups, the
745ca3e8d8Dave Plauger * caller initializes and presents a memlist walker.
746ca3e8d8Dave Plauger */
747ca3e8d8Dave Plaugerstatic pfn_t
748ca3e8d8Dave Plaugerdump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
749ca3e8d8Dave Plauger{
750ca3e8d8Dave Plauger	bitnum -= pw->basenum;
751ca3e8d8Dave Plauger	while (pw->mp != NULL) {
752ca3e8d8Dave Plauger		if (bitnum < pw->mppages) {
753ca3e8d8Dave Plauger			pw->mpleft = pw->mppages - bitnum;
754ca3e8d8Dave Plauger			return (pw->mpaddr + bitnum);
755ca3e8d8Dave Plauger		}
756ca3e8d8Dave Plauger		bitnum -= pw->mppages;
757ca3e8d8Dave Plauger		pw->basenum += pw->mppages;
75856f3320Jonathan Adams		pw->mp = pw->mp->ml_next;
759ca3e8d8Dave Plauger		if (pw->mp != NULL) {
76056f3320Jonathan Adams			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
761ca3e8d8Dave Plauger			pw->mpleft = pw->mppages;
76256f3320Jonathan Adams			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
763ca3e8d8Dave Plauger		}
764ca3e8d8Dave Plauger	}
765ca3e8d8Dave Plauger	return (PFN_INVALID);
766ca3e8d8Dave Plauger}
767ca3e8d8Dave Plauger
768ca3e8d8Dave Plaugerstatic pgcnt_t
769ca3e8d8Dave Plaugerdump_pfn_to_bitnum(pfn_t pfn)
770ca3e8d8Dave Plauger{
771ca3e8d8Dave Plauger	struct memlist *mp;
772ca3e8d8Dave Plauger	pgcnt_t bitnum = 0;
773ca3e8d8Dave Plauger
77456f3320Jonathan Adams	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
77556f3320Jonathan Adams		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
77656f3320Jonathan Adams		    pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
77756f3320Jonathan Adams			return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
77856f3320Jonathan Adams		bitnum += mp->ml_size >> PAGESHIFT;
779ca3e8d8Dave Plauger	}
780ca3e8d8Dave Plauger	return ((pgcnt_t)-1);
781ca3e8d8Dave Plauger}
782ca3e8d8Dave Plauger
783ca3e8d8Dave Plauger/*
784ca3e8d8Dave Plauger * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
785ca3e8d8Dave Plauger * mapping of pfn to range index is imperfect because pfn and bitnum
786ca3e8d8Dave Plauger * do not have the same phase. To make sure a CBUF_MAPSIZE range is
787ca3e8d8Dave Plauger * covered, call this for both ends:
788ca3e8d8Dave Plauger *	dump_set_used(base)
789ca3e8d8Dave Plauger *	dump_set_used(base+CBUF_MAPNP-1)
790ca3e8d8Dave Plauger *
791ca3e8d8Dave Plauger * This is used during a panic dump to mark pages allocated by
792ca3e8d8Dave Plauger * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
793ca3e8d8Dave Plauger * page_get_mnode_freelist() to make sure pages used by dump are never
794ca3e8d8Dave Plauger * allocated.
795ca3e8d8Dave Plauger */
796ca3e8d8Dave Plauger#define	CBUF_MAPP2R(pfn)	((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
797ca3e8d8Dave Plauger
798ca3e8d8Dave Plaugerstatic void
799ca3e8d8Dave Plaugerdump_set_used(pfn_t pfn)
800ca3e8d8Dave Plauger{
801ca3e8d8Dave Plauger
802ca3e8d8Dave Plauger	pgcnt_t bitnum, rbitnum;
803ca3e8d8Dave Plauger
804ca3e8d8Dave Plauger	bitnum = dump_pfn_to_bitnum(pfn);
805ca3e8d8Dave Plauger	ASSERT(bitnum != (pgcnt_t)-1);
806ca3e8d8Dave Plauger
807ca3e8d8Dave Plauger	rbitnum = CBUF_MAPP2R(bitnum);
808ca3e8d8Dave Plauger	ASSERT(rbitnum < dumpcfg.rbitmapsize);
809ca3e8d8Dave Plauger
810ca3e8d8Dave Plauger	BT_SET(dumpcfg.rbitmap, rbitnum);
811ca3e8d8Dave Plauger}
812ca3e8d8Dave Plauger
813ca3e8d8Dave Plaugerint
814ca3e8d8Dave Plaugerdump_test_used(pfn_t pfn)
815ca3e8d8Dave Plauger{
816ca3e8d8Dave Plauger	pgcnt_t bitnum, rbitnum;
817ca3e8d8Dave Plauger
818ca3e8d8Dave Plauger	bitnum = dump_pfn_to_bitnum(pfn);
819ca3e8d8Dave Plauger	ASSERT(bitnum != (pgcnt_t)-1);
820ca3e8d8Dave Plauger
821ca3e8d8Dave Plauger	rbitnum = CBUF_MAPP2R(bitnum);
822ca3e8d8Dave Plauger	ASSERT(rbitnum < dumpcfg.rbitmapsize);
823ca3e8d8Dave Plauger
824ca3e8d8Dave Plauger	return (BT_TEST(dumpcfg.rbitmap, rbitnum));
825ca3e8d8Dave Plauger}
826ca3e8d8Dave Plauger
827ca3e8d8Dave Plauger/*
828ca3e8d8Dave Plauger * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
829ca3e8d8Dave Plauger * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
830ca3e8d8Dave Plauger */
831ca3e8d8Dave Plaugerstatic void *
832ca3e8d8Dave Plaugerdumpbzalloc(void *opaque, int items, int size)
833ca3e8d8Dave Plauger{
834ca3e8d8Dave Plauger	size_t *sz;
835ca3e8d8Dave Plauger	char *ret;
836ca3e8d8Dave Plauger
837ca3e8d8Dave Plauger	ASSERT(opaque != NULL);
838ca3e8d8Dave Plauger	sz = opaque;
839ca3e8d8Dave Plauger	ret = dumpcfg.maxvm + *sz;
840ca3e8d8Dave Plauger	*sz += items * size;
841ca3e8d8Dave Plauger	*sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN);
842ca3e8d8Dave Plauger	ASSERT(*sz <= dumpcfg.maxvmsize);
843ca3e8d8Dave Plauger	return (ret);
844ca3e8d8Dave Plauger}
845ca3e8d8Dave Plauger
/*
 * Free callback paired with dumpbzalloc. It intentionally does
 * nothing: both arguments are ignored and no memory is released.
 */
/*ARGSUSED*/
static void
dumpbzfree(void *opaque, void *addr)
{
}
851ca3e8d8Dave Plauger
852ca3e8d8Dave Plauger/*
853ca3e8d8Dave Plauger * Perform additional checks on the page to see if we can really use
854ca3e8d8Dave Plauger * it. The kernel (kas) pages are always set in the bitmap. However,
855ca3e8d8Dave Plauger * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
856ca3e8d8Dave Plauger * bitmap. So we check for them.
857ca3e8d8Dave Plauger */
858ca3e8d8Dave Plaugerstatic inline int
859ca3e8d8Dave Plaugerdump_pfn_check(pfn_t pfn)
860ca3e8d8Dave Plauger{
861ca3e8d8Dave Plauger	page_t *pp = page_numtopp_nolock(pfn);
862ca3e8d8Dave Plauger	if (pp == NULL || pp->p_pagenum != pfn ||
863ca3e8d8Dave Plauger#if defined(__sparc)
864af4c679Sean McEnroe	    pp->p_vnode == &promvp ||
865ca3e8d8Dave Plauger#else
866ca3e8d8Dave Plauger	    PP_ISBOOTPAGES(pp) ||
867ca3e8d8Dave Plauger#endif
868ca3e8d8Dave Plauger	    pp->p_toxic != 0)
869ca3e8d8Dave Plauger		return (0);
870ca3e8d8Dave Plauger	return (1);
871ca3e8d8Dave Plauger}
872ca3e8d8Dave Plauger
873ca3e8d8Dave Plauger/*
874ca3e8d8Dave Plauger * Check a range to see if all contained pages are available and
875ca3e8d8Dave Plauger * return non-zero if the range can be used.
876ca3e8d8Dave Plauger */
877ca3e8d8Dave Plaugerstatic inline int
878ca3e8d8Dave Plaugerdump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
879ca3e8d8Dave Plauger{
880ca3e8d8Dave Plauger	for (; start < end; start++, pfn++) {
881ca3e8d8Dave Plauger		if (BT_TEST(dumpcfg.bitmap, start))
882ca3e8d8Dave Plauger			return (0);
883ca3e8d8Dave Plauger		if (!dump_pfn_check(pfn))
884ca3e8d8Dave Plauger			return (0);
885ca3e8d8Dave Plauger	}
886ca3e8d8Dave Plauger	return (1);
887ca3e8d8Dave Plauger}
888ca3e8d8Dave Plauger
889ca3e8d8Dave Plauger/*
890ca3e8d8Dave Plauger * dumpsys_get_maxmem() is called during panic. Find unused ranges
891ca3e8d8Dave Plauger * and use them for buffers. If we find enough memory switch to
892ca3e8d8Dave Plauger * parallel bzip2, otherwise use parallel lzjb.
893ca3e8d8Dave Plauger *
894ca3e8d8Dave Plauger * It searches the dump bitmap in 2 passes. The first time it looks
895ca3e8d8Dave Plauger * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
896ca3e8d8Dave Plauger */
897ca3e8d8Dave Plaugerstatic void
898ca3e8d8Dave Plaugerdumpsys_get_maxmem()
899ca3e8d8Dave Plauger{
900ca3e8d8Dave Plauger	dumpcfg_t *cfg = &dumpcfg;
901ca3e8d8Dave Plauger	cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
902ca3e8d8Dave Plauger	helper_t *endhp = &cfg->helper[cfg->nhelper];
903ca3e8d8Dave Plauger	pgcnt_t bitnum, end;
904ca3e8d8Dave Plauger	size_t sz, endsz, bz2size;
905ca3e8d8Dave Plauger	pfn_t pfn, off;
906ca3e8d8Dave Plauger	cbuf_t *cp;
907ca3e8d8Dave Plauger	helper_t *hp, *ohp;
908ca3e8d8Dave Plauger	dumpmlw_t mlw;
909ca3e8d8Dave Plauger	int k;
910ca3e8d8Dave Plauger
9114cca9c8Dave Plauger	/*
9129c84d1aDave Plauger	 * Setting dump_plat_mincpu to 0 at any time forces a serial
9139c84d1aDave Plauger	 * dump.
9144cca9c8Dave Plauger	 */
9159c84d1aDave Plauger	if (dump_plat_mincpu == 0) {
9164cca9c8Dave Plauger		cfg->clevel = 0;
9174cca9c8Dave Plauger		return;
9184cca9c8Dave Plauger	}
9194cca9c8Dave Plauger
9204cca9c8Dave Plauger	/*
9214cca9c8Dave Plauger	 * There may be no point in looking for spare memory. If
9224cca9c8Dave Plauger	 * dumping all memory, then none is spare. If doing a serial
9234cca9c8Dave Plauger	 * dump, then already have buffers.
9244cca9c8Dave Plauger	 */
925ca3e8d8Dave Plauger	if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
92686f2194Dave Plauger	    (dump_conflags & DUMP_ALL) != 0) {
92786f2194Dave Plauger		if (cfg->clevel > DUMP_CLEVEL_LZJB)
92886f2194Dave Plauger			cfg->clevel = DUMP_CLEVEL_LZJB;
929ca3e8d8Dave Plauger		return;
93086f2194Dave Plauger	}
931ca3e8d8Dave Plauger
932ca3e8d8Dave Plauger	sz = 0;
933ca3e8d8Dave Plauger	cfg->found4m = 0;
934ca3e8d8Dave Plauger	cfg->foundsm = 0;
935ca3e8d8Dave Plauger
936ca3e8d8Dave Plauger	/* bitmap of ranges used to estimate which pfns are being used */
937ca3e8d8Dave Plauger	bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));
938ca3e8d8Dave Plauger
939ca3e8d8Dave Plauger	/* find ranges that are not being dumped to use for buffers */
940ca3e8d8Dave Plauger	dump_init_memlist_walker(&mlw);
941ca3e8d8Dave Plauger	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
942ca3e8d8Dave Plauger		dump_timeleft = dump_timeout;
943ca3e8d8Dave Plauger		end = bitnum + CBUF_MAPNP;
944ca3e8d8Dave Plauger		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
945ca3e8d8Dave Plauger		ASSERT(pfn != PFN_INVALID);
946ca3e8d8Dave Plauger
947ca3e8d8Dave Plauger		/* skip partial range at end of mem segment */
948ca3e8d8Dave Plauger		if (mlw.mpleft < CBUF_MAPNP) {
949ca3e8d8Dave Plauger			end = bitnum + mlw.mpleft;
950ca3e8d8Dave Plauger			continue;
951ca3e8d8Dave Plauger		}
952ca3e8d8Dave Plauger
953ca3e8d8Dave Plauger		/* skip non aligned pages */
954ca3e8d8Dave Plauger		off = P2PHASE(pfn, CBUF_MAPNP);
955ca3e8d8Dave Plauger		if (off != 0) {
956ca3e8d8Dave Plauger			end -= off;
957ca3e8d8Dave Plauger			continue;
958ca3e8d8Dave Plauger		}
959ca3e8d8Dave Plauger
960ca3e8d8Dave Plauger		if (!dump_range_check(bitnum, end, pfn))
961ca3e8d8Dave Plauger			continue;
962ca3e8d8Dave Plauger
963ca3e8d8Dave Plauger		ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
964ca3e8d8Dave Plauger		hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
965ca3e8d8Dave Plauger		    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
966ca3e8d8Dave Plauger		sz += CBUF_MAPSIZE;
967ca3e8d8Dave Plauger		cfg->found4m++;
968ca3e8d8Dave Plauger
969ca3e8d8Dave Plauger		/* set the bitmap for both ends to be sure to cover the range */
970ca3e8d8Dave Plauger		dump_set_used(pfn);
971ca3e8d8Dave Plauger		dump_set_used(pfn + CBUF_MAPNP - 1);
972ca3e8d8Dave Plauger
973ca3e8d8Dave Plauger		if (sz >= cfg->maxsize)
974ca3e8d8Dave Plauger			goto foundmax;
975ca3e8d8Dave Plauger	}
976ca3e8d8Dave Plauger
977ca3e8d8Dave Plauger	/* Add small pages if we can't find enough large pages. */
978ca3e8d8Dave Plauger	dump_init_memlist_walker(&mlw);
979ca3e8d8Dave Plauger	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
980ca3e8d8Dave Plauger		dump_timeleft = dump_timeout;
981ca3e8d8Dave Plauger		end = bitnum + CBUF_MAPNP;
982ca3e8d8Dave Plauger		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
983ca3e8d8Dave Plauger		ASSERT(pfn != PFN_INVALID);
984ca3e8d8Dave Plauger
985ca3e8d8Dave Plauger		/* Find any non-aligned pages at start and end of segment. */
986ca3e8d8Dave Plauger		off = P2PHASE(pfn, CBUF_MAPNP);
987ca3e8d8Dave Plauger		if (mlw.mpleft < CBUF_MAPNP) {
988ca3e8d8Dave Plauger			end = bitnum + mlw.mpleft;
989ca3e8d8Dave Plauger		} else if (off != 0) {
990ca3e8d8Dave Plauger			end -= off;
991ca3e8d8Dave Plauger		} else if (cfg->found4m && dump_test_used(pfn)) {
992ca3e8d8Dave Plauger			continue;
993ca3e8d8Dave Plauger		}
994ca3e8d8Dave Plauger
995ca3e8d8Dave Plauger		for (; bitnum < end; bitnum++, pfn++) {
996ca3e8d8Dave Plauger			dump_timeleft = dump_timeout;
997ca3e8d8Dave Plauger			if (BT_TEST(dumpcfg.bitmap, bitnum))
998ca3e8d8Dave Plauger				continue;
999ca3e8d8Dave Plauger			if (!dump_pfn_check(pfn))