1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/mutex.h>
39#include <sys/bio.h>
40#include <sys/sbuf.h>
41#include <sys/sysctl.h>
42#include <sys/malloc.h>
43#include <sys/eventhandler.h>
44#include <vm/uma.h>
45#include <geom/geom.h>
46#include <geom/geom_dbg.h>
47#include <sys/proc.h>
48#include <sys/kthread.h>
49#include <sys/sched.h>
50#include <geom/raid/g_raid.h>
51#include "g_raid_md_if.h"
52#include "g_raid_tr_if.h"
53
/* Malloc type used for allocations made with M_RAID in this file. */
static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");

/*
 * Tunables exported under the kern.geom.raid sysctl node.  All of them
 * are declared CTLFLAG_RWTUN.
 */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_RAID stuff");
/* kern.geom.raid.enable: enable on-disk metadata taste (default 1). */
int g_raid_enable = 1;
SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RWTUN,
    &g_raid_enable, 0, "Enable on-disk metadata taste");
/* kern.geom.raid.aggressive_spare: use metadata-less disks as spares. */
u_int g_raid_aggressive_spare = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RWTUN,
    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
/* kern.geom.raid.debug: debug verbosity level. */
u_int g_raid_debug = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid_debug, 0,
    "Debug level");
/*
 * kern.geom.raid.read_err_thresh: read errors equated to disk failure.
 * NOTE(review): the variable is a signed int but is exported with
 * SYSCTL_UINT -- confirm whether SYSCTL_INT (or a u_int variable) was
 * intended.
 */
int g_raid_read_err_thresh = 10;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RWTUN,
    &g_raid_read_err_thresh, 0,
    "Number of read errors equated to disk failure");
/* kern.geom.raid.start_timeout: time to wait for all array components. */
u_int g_raid_start_timeout = 30;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RWTUN,
    &g_raid_start_timeout, 0,
    "Time to wait for all array components");
/*
 * kern.geom.raid.clean_time: idle period after which g_raid_clean()
 * marks a volume clean (compared against time_uptime there).
 */
static u_int g_raid_clean_time = 5;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RWTUN,
    &g_raid_clean_time, 0, "Mark volume as clean when idling");
/* kern.geom.raid.disconnect_on_failure: drop a component on I/O failure. */
static u_int g_raid_disconnect_on_failure = 1;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
/* kern.geom.raid.name_format: selects the provider naming scheme. */
static u_int g_raid_name_format = 0;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RWTUN,
    &g_raid_name_format, 0, "Providers name format.");
/*
 * kern.geom.raid.idle_threshold: microseconds before a volume is idle.
 * NOTE(review): every other entry passes 0 as the value argument next to
 * the variable pointer; the 1000000 here looks redundant -- confirm.
 */
static u_int g_raid_idle_threshold = 1000000;
SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RWTUN,
    &g_raid_idle_threshold, 1000000,
    "Time in microseconds to consider a volume idle.");
89
/*
 * msleep() wrapper that emits a level-4 debug message naming the calling
 * function and the wait channel before going to sleep and again after
 * waking up.  The msleep() return value is stored in rv.
 */
#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
} while (0)
95
/* Global list of struct g_raid_md_class entries; starts out empty. */
LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
    LIST_HEAD_INITIALIZER(g_raid_md_classes);

/* Global list of struct g_raid_tr_class entries; starts out empty. */
LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
    LIST_HEAD_INITIALIZER(g_raid_tr_classes);

/* Global list of struct g_raid_volume entries; starts out empty. */
LIST_HEAD(, g_raid_volume) g_raid_volumes =
    LIST_HEAD_INITIALIZER(g_raid_volumes);

/*
 * Event handler tag, NULL until registered.
 * NOTE(review): presumably registered by the class init code, which is
 * not visible in this part of the file -- confirm.
 */
static eventhandler_tag g_raid_post_sync = NULL;
/* Module state flag; not referenced in the visible part of the file. */
static int g_raid_started = 0;
/*
 * Non-zero makes g_raid_clean() skip its idle-time delay and mark dirty
 * volumes clean immediately.
 */
static int g_raid_shutdown = 0;
108
/* GEOM class methods implemented in this file. */
static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid_taste;
static void g_raid_init(struct g_class *mp);
static void g_raid_fini(struct g_class *mp);

/*
 * GEOM class descriptor wiring the class name, version and method
 * pointers together.  g_raid_ctl is not declared in this file section;
 * it is expected to come from an included header.
 */
struct g_class g_raid_class = {
	.name = G_RAID_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid_ctl,
	.taste = g_raid_taste,
	.destroy_geom = g_raid_destroy_geom,
	.init = g_raid_init,
	.fini = g_raid_fini
};
124
/*
 * Forward declarations of file-local helpers so they can be referenced
 * before their definitions.
 */
static void g_raid_destroy_provider(struct g_raid_volume *vol);
static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid_start(struct bio *bp);
static void g_raid_start_request(struct bio *bp);
static void g_raid_disk_done(struct bio *bp);
static void g_raid_poll(struct g_raid_softc *sc);
136
137static const char *
138g_raid_node_event2str(int event)
139{
140
141	switch (event) {
142	case G_RAID_NODE_E_WAKE:
143		return ("WAKE");
144	case G_RAID_NODE_E_START:
145		return ("START");
146	default:
147		return ("INVALID");
148	}
149}
150
151const char *
152g_raid_disk_state2str(int state)
153{
154
155	switch (state) {
156	case G_RAID_DISK_S_NONE:
157		return ("NONE");
158	case G_RAID_DISK_S_OFFLINE:
159		return ("OFFLINE");
160	case G_RAID_DISK_S_DISABLED:
161		return ("DISABLED");
162	case G_RAID_DISK_S_FAILED:
163		return ("FAILED");
164	case G_RAID_DISK_S_STALE_FAILED:
165		return ("STALE_FAILED");
166	case G_RAID_DISK_S_SPARE:
167		return ("SPARE");
168	case G_RAID_DISK_S_STALE:
169		return ("STALE");
170	case G_RAID_DISK_S_ACTIVE:
171		return ("ACTIVE");
172	default:
173		return ("INVALID");
174	}
175}
176
177static const char *
178g_raid_disk_event2str(int event)
179{
180
181	switch (event) {
182	case G_RAID_DISK_E_DISCONNECTED:
183		return ("DISCONNECTED");
184	default:
185		return ("INVALID");
186	}
187}
188
189const char *
190g_raid_subdisk_state2str(int state)
191{
192
193	switch (state) {
194	case G_RAID_SUBDISK_S_NONE:
195		return ("NONE");
196	case G_RAID_SUBDISK_S_FAILED:
197		return ("FAILED");
198	case G_RAID_SUBDISK_S_NEW:
199		return ("NEW");
200	case G_RAID_SUBDISK_S_REBUILD:
201		return ("REBUILD");
202	case G_RAID_SUBDISK_S_UNINITIALIZED:
203		return ("UNINITIALIZED");
204	case G_RAID_SUBDISK_S_STALE:
205		return ("STALE");
206	case G_RAID_SUBDISK_S_RESYNC:
207		return ("RESYNC");
208	case G_RAID_SUBDISK_S_ACTIVE:
209		return ("ACTIVE");
210	default:
211		return ("INVALID");
212	}
213}
214
215static const char *
216g_raid_subdisk_event2str(int event)
217{
218
219	switch (event) {
220	case G_RAID_SUBDISK_E_NEW:
221		return ("NEW");
222	case G_RAID_SUBDISK_E_FAILED:
223		return ("FAILED");
224	case G_RAID_SUBDISK_E_DISCONNECTED:
225		return ("DISCONNECTED");
226	default:
227		return ("INVALID");
228	}
229}
230
231const char *
232g_raid_volume_state2str(int state)
233{
234
235	switch (state) {
236	case G_RAID_VOLUME_S_STARTING:
237		return ("STARTING");
238	case G_RAID_VOLUME_S_BROKEN:
239		return ("BROKEN");
240	case G_RAID_VOLUME_S_DEGRADED:
241		return ("DEGRADED");
242	case G_RAID_VOLUME_S_SUBOPTIMAL:
243		return ("SUBOPTIMAL");
244	case G_RAID_VOLUME_S_OPTIMAL:
245		return ("OPTIMAL");
246	case G_RAID_VOLUME_S_UNSUPPORTED:
247		return ("UNSUPPORTED");
248	case G_RAID_VOLUME_S_STOPPED:
249		return ("STOPPED");
250	default:
251		return ("INVALID");
252	}
253}
254
255static const char *
256g_raid_volume_event2str(int event)
257{
258
259	switch (event) {
260	case G_RAID_VOLUME_E_UP:
261		return ("UP");
262	case G_RAID_VOLUME_E_DOWN:
263		return ("DOWN");
264	case G_RAID_VOLUME_E_START:
265		return ("START");
266	case G_RAID_VOLUME_E_STARTMD:
267		return ("STARTMD");
268	default:
269		return ("INVALID");
270	}
271}
272
273const char *
274g_raid_volume_level2str(int level, int qual)
275{
276
277	switch (level) {
278	case G_RAID_VOLUME_RL_RAID0:
279		return ("RAID0");
280	case G_RAID_VOLUME_RL_RAID1:
281		return ("RAID1");
282	case G_RAID_VOLUME_RL_RAID3:
283		if (qual == G_RAID_VOLUME_RLQ_R3P0)
284			return ("RAID3-P0");
285		if (qual == G_RAID_VOLUME_RLQ_R3PN)
286			return ("RAID3-PN");
287		return ("RAID3");
288	case G_RAID_VOLUME_RL_RAID4:
289		if (qual == G_RAID_VOLUME_RLQ_R4P0)
290			return ("RAID4-P0");
291		if (qual == G_RAID_VOLUME_RLQ_R4PN)
292			return ("RAID4-PN");
293		return ("RAID4");
294	case G_RAID_VOLUME_RL_RAID5:
295		if (qual == G_RAID_VOLUME_RLQ_R5RA)
296			return ("RAID5-RA");
297		if (qual == G_RAID_VOLUME_RLQ_R5RS)
298			return ("RAID5-RS");
299		if (qual == G_RAID_VOLUME_RLQ_R5LA)
300			return ("RAID5-LA");
301		if (qual == G_RAID_VOLUME_RLQ_R5LS)
302			return ("RAID5-LS");
303		return ("RAID5");
304	case G_RAID_VOLUME_RL_RAID6:
305		if (qual == G_RAID_VOLUME_RLQ_R6RA)
306			return ("RAID6-RA");
307		if (qual == G_RAID_VOLUME_RLQ_R6RS)
308			return ("RAID6-RS");
309		if (qual == G_RAID_VOLUME_RLQ_R6LA)
310			return ("RAID6-LA");
311		if (qual == G_RAID_VOLUME_RLQ_R6LS)
312			return ("RAID6-LS");
313		return ("RAID6");
314	case G_RAID_VOLUME_RL_RAIDMDF:
315		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
316			return ("RAIDMDF-RA");
317		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
318			return ("RAIDMDF-RS");
319		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
320			return ("RAIDMDF-LA");
321		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
322			return ("RAIDMDF-LS");
323		return ("RAIDMDF");
324	case G_RAID_VOLUME_RL_RAID1E:
325		if (qual == G_RAID_VOLUME_RLQ_R1EA)
326			return ("RAID1E-A");
327		if (qual == G_RAID_VOLUME_RLQ_R1EO)
328			return ("RAID1E-O");
329		return ("RAID1E");
330	case G_RAID_VOLUME_RL_SINGLE:
331		return ("SINGLE");
332	case G_RAID_VOLUME_RL_CONCAT:
333		return ("CONCAT");
334	case G_RAID_VOLUME_RL_RAID5E:
335		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
336			return ("RAID5E-RA");
337		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
338			return ("RAID5E-RS");
339		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
340			return ("RAID5E-LA");
341		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
342			return ("RAID5E-LS");
343		return ("RAID5E");
344	case G_RAID_VOLUME_RL_RAID5EE:
345		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
346			return ("RAID5EE-RA");
347		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
348			return ("RAID5EE-RS");
349		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
350			return ("RAID5EE-LA");
351		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
352			return ("RAID5EE-LS");
353		return ("RAID5EE");
354	case G_RAID_VOLUME_RL_RAID5R:
355		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
356			return ("RAID5R-RA");
357		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
358			return ("RAID5R-RS");
359		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
360			return ("RAID5R-LA");
361		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
362			return ("RAID5R-LS");
363		return ("RAID5E");
364	default:
365		return ("UNKNOWN");
366	}
367}
368
369int
370g_raid_volume_str2level(const char *str, int *level, int *qual)
371{
372
373	*level = G_RAID_VOLUME_RL_UNKNOWN;
374	*qual = G_RAID_VOLUME_RLQ_NONE;
375	if (strcasecmp(str, "RAID0") == 0)
376		*level = G_RAID_VOLUME_RL_RAID0;
377	else if (strcasecmp(str, "RAID1") == 0)
378		*level = G_RAID_VOLUME_RL_RAID1;
379	else if (strcasecmp(str, "RAID3-P0") == 0) {
380		*level = G_RAID_VOLUME_RL_RAID3;
381		*qual = G_RAID_VOLUME_RLQ_R3P0;
382	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
383		   strcasecmp(str, "RAID3") == 0) {
384		*level = G_RAID_VOLUME_RL_RAID3;
385		*qual = G_RAID_VOLUME_RLQ_R3PN;
386	} else if (strcasecmp(str, "RAID4-P0") == 0) {
387		*level = G_RAID_VOLUME_RL_RAID4;
388		*qual = G_RAID_VOLUME_RLQ_R4P0;
389	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
390		   strcasecmp(str, "RAID4") == 0) {
391		*level = G_RAID_VOLUME_RL_RAID4;
392		*qual = G_RAID_VOLUME_RLQ_R4PN;
393	} else if (strcasecmp(str, "RAID5-RA") == 0) {
394		*level = G_RAID_VOLUME_RL_RAID5;
395		*qual = G_RAID_VOLUME_RLQ_R5RA;
396	} else if (strcasecmp(str, "RAID5-RS") == 0) {
397		*level = G_RAID_VOLUME_RL_RAID5;
398		*qual = G_RAID_VOLUME_RLQ_R5RS;
399	} else if (strcasecmp(str, "RAID5") == 0 ||
400		   strcasecmp(str, "RAID5-LA") == 0) {
401		*level = G_RAID_VOLUME_RL_RAID5;
402		*qual = G_RAID_VOLUME_RLQ_R5LA;
403	} else if (strcasecmp(str, "RAID5-LS") == 0) {
404		*level = G_RAID_VOLUME_RL_RAID5;
405		*qual = G_RAID_VOLUME_RLQ_R5LS;
406	} else if (strcasecmp(str, "RAID6-RA") == 0) {
407		*level = G_RAID_VOLUME_RL_RAID6;
408		*qual = G_RAID_VOLUME_RLQ_R6RA;
409	} else if (strcasecmp(str, "RAID6-RS") == 0) {
410		*level = G_RAID_VOLUME_RL_RAID6;
411		*qual = G_RAID_VOLUME_RLQ_R6RS;
412	} else if (strcasecmp(str, "RAID6") == 0 ||
413		   strcasecmp(str, "RAID6-LA") == 0) {
414		*level = G_RAID_VOLUME_RL_RAID6;
415		*qual = G_RAID_VOLUME_RLQ_R6LA;
416	} else if (strcasecmp(str, "RAID6-LS") == 0) {
417		*level = G_RAID_VOLUME_RL_RAID6;
418		*qual = G_RAID_VOLUME_RLQ_R6LS;
419	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
420		*level = G_RAID_VOLUME_RL_RAIDMDF;
421		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
422	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
423		*level = G_RAID_VOLUME_RL_RAIDMDF;
424		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
425	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
426		   strcasecmp(str, "RAIDMDF-LA") == 0) {
427		*level = G_RAID_VOLUME_RL_RAIDMDF;
428		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
429	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
430		*level = G_RAID_VOLUME_RL_RAIDMDF;
431		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
432	} else if (strcasecmp(str, "RAID10") == 0 ||
433		   strcasecmp(str, "RAID1E") == 0 ||
434		   strcasecmp(str, "RAID1E-A") == 0) {
435		*level = G_RAID_VOLUME_RL_RAID1E;
436		*qual = G_RAID_VOLUME_RLQ_R1EA;
437	} else if (strcasecmp(str, "RAID1E-O") == 0) {
438		*level = G_RAID_VOLUME_RL_RAID1E;
439		*qual = G_RAID_VOLUME_RLQ_R1EO;
440	} else if (strcasecmp(str, "SINGLE") == 0)
441		*level = G_RAID_VOLUME_RL_SINGLE;
442	else if (strcasecmp(str, "CONCAT") == 0)
443		*level = G_RAID_VOLUME_RL_CONCAT;
444	else if (strcasecmp(str, "RAID5E-RA") == 0) {
445		*level = G_RAID_VOLUME_RL_RAID5E;
446		*qual = G_RAID_VOLUME_RLQ_R5ERA;
447	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
448		*level = G_RAID_VOLUME_RL_RAID5E;
449		*qual = G_RAID_VOLUME_RLQ_R5ERS;
450	} else if (strcasecmp(str, "RAID5E") == 0 ||
451		   strcasecmp(str, "RAID5E-LA") == 0) {
452		*level = G_RAID_VOLUME_RL_RAID5E;
453		*qual = G_RAID_VOLUME_RLQ_R5ELA;
454	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
455		*level = G_RAID_VOLUME_RL_RAID5E;
456		*qual = G_RAID_VOLUME_RLQ_R5ELS;
457	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
458		*level = G_RAID_VOLUME_RL_RAID5EE;
459		*qual = G_RAID_VOLUME_RLQ_R5EERA;
460	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
461		*level = G_RAID_VOLUME_RL_RAID5EE;
462		*qual = G_RAID_VOLUME_RLQ_R5EERS;
463	} else if (strcasecmp(str, "RAID5EE") == 0 ||
464		   strcasecmp(str, "RAID5EE-LA") == 0) {
465		*level = G_RAID_VOLUME_RL_RAID5EE;
466		*qual = G_RAID_VOLUME_RLQ_R5EELA;
467	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
468		*level = G_RAID_VOLUME_RL_RAID5EE;
469		*qual = G_RAID_VOLUME_RLQ_R5EELS;
470	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
471		*level = G_RAID_VOLUME_RL_RAID5R;
472		*qual = G_RAID_VOLUME_RLQ_R5RRA;
473	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
474		*level = G_RAID_VOLUME_RL_RAID5R;
475		*qual = G_RAID_VOLUME_RLQ_R5RRS;
476	} else if (strcasecmp(str, "RAID5R") == 0 ||
477		   strcasecmp(str, "RAID5R-LA") == 0) {
478		*level = G_RAID_VOLUME_RL_RAID5R;
479		*qual = G_RAID_VOLUME_RLQ_R5RLA;
480	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
481		*level = G_RAID_VOLUME_RL_RAID5R;
482		*qual = G_RAID_VOLUME_RLQ_R5RLS;
483	} else
484		return (-1);
485	return (0);
486}
487
488const char *
489g_raid_get_diskname(struct g_raid_disk *disk)
490{
491
492	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
493		return ("[unknown]");
494	return (disk->d_consumer->provider->name);
495}
496
497void
498g_raid_get_disk_info(struct g_raid_disk *disk)
499{
500	struct g_consumer *cp = disk->d_consumer;
501	int error, len;
502
503	/* Read kernel dumping information. */
504	disk->d_kd.offset = 0;
505	disk->d_kd.length = OFF_MAX;
506	len = sizeof(disk->d_kd);
507	error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
508	if (error)
509		disk->d_kd.di.dumper = NULL;
510	if (disk->d_kd.di.dumper == NULL)
511		G_RAID_DEBUG1(2, disk->d_softc,
512		    "Dumping not supported by %s: %d.",
513		    cp->provider->name, error);
514
515	/* Read BIO_DELETE support. */
516	error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
517	if (error)
518		disk->d_candelete = 0;
519	if (!disk->d_candelete)
520		G_RAID_DEBUG1(2, disk->d_softc,
521		    "BIO_DELETE not supported by %s: %d.",
522		    cp->provider->name, error);
523}
524
525void
526g_raid_report_disk_state(struct g_raid_disk *disk)
527{
528	struct g_raid_subdisk *sd;
529	int len, state;
530	uint32_t s;
531
532	if (disk->d_consumer == NULL)
533		return;
534	if (disk->d_state == G_RAID_DISK_S_DISABLED) {
535		s = G_STATE_ACTIVE; /* XXX */
536	} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
537	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
538		s = G_STATE_FAILED;
539	} else {
540		state = G_RAID_SUBDISK_S_ACTIVE;
541		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
542			if (sd->sd_state < state)
543				state = sd->sd_state;
544		}
545		if (state == G_RAID_SUBDISK_S_FAILED)
546			s = G_STATE_FAILED;
547		else if (state == G_RAID_SUBDISK_S_NEW ||
548		    state == G_RAID_SUBDISK_S_REBUILD)
549			s = G_STATE_REBUILD;
550		else if (state == G_RAID_SUBDISK_S_STALE ||
551		    state == G_RAID_SUBDISK_S_RESYNC)
552			s = G_STATE_RESYNC;
553		else
554			s = G_STATE_ACTIVE;
555	}
556	len = sizeof(s);
557	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
558	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
559	    g_raid_get_diskname(disk), s);
560}
561
562void
563g_raid_change_disk_state(struct g_raid_disk *disk, int state)
564{
565
566	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
567	    g_raid_get_diskname(disk),
568	    g_raid_disk_state2str(disk->d_state),
569	    g_raid_disk_state2str(state));
570	disk->d_state = state;
571	g_raid_report_disk_state(disk);
572}
573
574void
575g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
576{
577
578	G_RAID_DEBUG1(0, sd->sd_softc,
579	    "Subdisk %s:%d-%s state changed from %s to %s.",
580	    sd->sd_volume->v_name, sd->sd_pos,
581	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
582	    g_raid_subdisk_state2str(sd->sd_state),
583	    g_raid_subdisk_state2str(state));
584	sd->sd_state = state;
585	if (sd->sd_disk)
586		g_raid_report_disk_state(sd->sd_disk);
587}
588
589void
590g_raid_change_volume_state(struct g_raid_volume *vol, int state)
591{
592
593	G_RAID_DEBUG1(0, vol->v_softc,
594	    "Volume %s state changed from %s to %s.",
595	    vol->v_name,
596	    g_raid_volume_state2str(vol->v_state),
597	    g_raid_volume_state2str(state));
598	vol->v_state = state;
599}
600
601/*
602 * --- Events handling functions ---
603 * Events in geom_raid are used to maintain subdisks and volumes status
604 * from one thread to simplify locking.
605 */
606static void
607g_raid_event_free(struct g_raid_event *ep)
608{
609
610	free(ep, M_RAID);
611}
612
/*
 * Queue an event on the softc's event list and wake up sleepers on the
 * softc; optionally wait until the event is marked done.
 *
 * arg is the event target.  flags tell how to find the owning softc:
 * G_RAID_EVENT_VOLUME, G_RAID_EVENT_DISK or G_RAID_EVENT_SUBDISK mean arg
 * is a volume, disk or subdisk; otherwise arg is the softc itself.
 *
 * Returns ENOMEM if the event could not be allocated and 0 if
 * G_RAID_EVENT_WAIT is not set.  With G_RAID_EVENT_WAIT the caller must
 * hold sc_lock exclusively; the lock is dropped while sleeping and
 * re-acquired before returning the event's e_error value.
 */
int
g_raid_event_send(void *arg, int event, int flags)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	int error;

	/* Locate the softc that owns the target. */
	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
		sc = ((struct g_raid_volume *)arg)->v_softc;
	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
		sc = ((struct g_raid_disk *)arg)->d_softc;
	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
	} else {
		sc = arg;
	}
	/*
	 * Sleep for memory only when sc_lock is held exclusively; other
	 * callers get a non-sleeping allocation that may fail.
	 */
	ep = malloc(sizeof(*ep), M_RAID,
	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->e_tgt = arg;
	ep->e_event = event;
	ep->e_flags = flags;
	ep->e_error = 0;
	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);

	/* Fire-and-forget: the event is not freed here in this case. */
	if ((flags & G_RAID_EVENT_WAIT) == 0)
		return (0);

	sx_assert(&sc->sc_lock, SX_XLOCKED);
	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
	sx_xunlock(&sc->sc_lock);
	/*
	 * Wait for G_RAID_EVENT_DONE.  PDROP makes msleep() return with
	 * the queue mutex released, so it is re-taken on every iteration.
	 * The flag is tested without the mutex held; the 5 second timeout
	 * bounds the delay if a wakeup is missed.
	 */
	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_queue_mtx);
		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
		    hz * 5);
	}
	/* The msleep() result is discarded in favor of the event's error. */
	error = ep->e_error;
	g_raid_event_free(ep);
	sx_xlock(&sc->sc_lock);
	return (error);
}
659
660static void
661g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
662{
663	struct g_raid_event *ep, *tmpep;
664
665	sx_assert(&sc->sc_lock, SX_XLOCKED);
666
667	mtx_lock(&sc->sc_queue_mtx);
668	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
669		if (ep->e_tgt != tgt)
670			continue;
671		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
672		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
673			g_raid_event_free(ep);
674		else {
675			ep->e_error = ECANCELED;
676			wakeup(ep);
677		}
678	}
679	mtx_unlock(&sc->sc_queue_mtx);
680}
681
682static int
683g_raid_event_check(struct g_raid_softc *sc, void *tgt)
684{
685	struct g_raid_event *ep;
686	int	res = 0;
687
688	sx_assert(&sc->sc_lock, SX_XLOCKED);
689
690	mtx_lock(&sc->sc_queue_mtx);
691	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
692		if (ep->e_tgt != tgt)
693			continue;
694		res = 1;
695		break;
696	}
697	mtx_unlock(&sc->sc_queue_mtx);
698	return (res);
699}
700
701/*
702 * Return the number of disks in given state.
703 * If state is equal to -1, count all connected disks.
704 */
705u_int
706g_raid_ndisks(struct g_raid_softc *sc, int state)
707{
708	struct g_raid_disk *disk;
709	u_int n;
710
711	sx_assert(&sc->sc_lock, SX_LOCKED);
712
713	n = 0;
714	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
715		if (disk->d_state == state || state == -1)
716			n++;
717	}
718	return (n);
719}
720
721/*
722 * Return the number of subdisks in given state.
723 * If state is equal to -1, count all connected disks.
724 */
725u_int
726g_raid_nsubdisks(struct g_raid_volume *vol, int state)
727{
728	struct g_raid_subdisk *subdisk;
729	struct g_raid_softc *sc;
730	u_int i, n ;
731
732	sc = vol->v_softc;
733	sx_assert(&sc->sc_lock, SX_LOCKED);
734
735	n = 0;
736	for (i = 0; i < vol->v_disks_count; i++) {
737		subdisk = &vol->v_subdisks[i];
738		if ((state == -1 &&
739		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
740		    subdisk->sd_state == state)
741			n++;
742	}
743	return (n);
744}
745
746/*
747 * Return the first subdisk in given state.
748 * If state is equal to -1, then the first connected disks.
749 */
750struct g_raid_subdisk *
751g_raid_get_subdisk(struct g_raid_volume *vol, int state)
752{
753	struct g_raid_subdisk *sd;
754	struct g_raid_softc *sc;
755	u_int i;
756
757	sc = vol->v_softc;
758	sx_assert(&sc->sc_lock, SX_LOCKED);
759
760	for (i = 0; i < vol->v_disks_count; i++) {
761		sd = &vol->v_subdisks[i];
762		if ((state == -1 &&
763		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
764		    sd->sd_state == state)
765			return (sd);
766	}
767	return (NULL);
768}
769
770struct g_consumer *
771g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
772{
773	struct g_consumer *cp;
774	struct g_provider *pp;
775
776	g_topology_assert();
777
778	if (strncmp(name, _PATH_DEV, 5) == 0)
779		name += 5;
780	pp = g_provider_by_name(name);
781	if (pp == NULL)
782		return (NULL);
783	cp = g_new_consumer(sc->sc_geom);
784	cp->flags |= G_CF_DIRECT_RECEIVE;
785	if (g_attach(cp, pp) != 0) {
786		g_destroy_consumer(cp);
787		return (NULL);
788	}
789	if (g_access(cp, 1, 1, 1) != 0) {
790		g_detach(cp);
791		g_destroy_consumer(cp);
792		return (NULL);
793	}
794	return (cp);
795}
796
797static u_int
798g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
799{
800	struct bio *bp;
801	u_int nreqs = 0;
802
803	mtx_lock(&sc->sc_queue_mtx);
804	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
805		if (bp->bio_from == cp)
806			nreqs++;
807	}
808	mtx_unlock(&sc->sc_queue_mtx);
809	return (nreqs);
810}
811
812u_int
813g_raid_nopens(struct g_raid_softc *sc)
814{
815	struct g_raid_volume *vol;
816	u_int opens;
817
818	opens = 0;
819	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
820		if (vol->v_provider_open != 0)
821			opens++;
822	}
823	return (opens);
824}
825
826static int
827g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
828{
829
830	if (cp->index > 0) {
831		G_RAID_DEBUG1(2, sc,
832		    "I/O requests for %s exist, can't destroy it now.",
833		    cp->provider->name);
834		return (1);
835	}
836	if (g_raid_nrequests(sc, cp) > 0) {
837		G_RAID_DEBUG1(2, sc,
838		    "I/O requests for %s in queue, can't destroy it now.",
839		    cp->provider->name);
840		return (1);
841	}
842	return (0);
843}
844
845static void
846g_raid_destroy_consumer(void *arg, int flags __unused)
847{
848	struct g_consumer *cp;
849
850	g_topology_assert();
851
852	cp = arg;
853	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
854	g_detach(cp);
855	g_destroy_consumer(cp);
856}
857
858void
859g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
860{
861	struct g_provider *pp;
862	int retaste_wait;
863
864	g_topology_assert_not();
865
866	g_topology_lock();
867	cp->private = NULL;
868	if (g_raid_consumer_is_busy(sc, cp))
869		goto out;
870	pp = cp->provider;
871	retaste_wait = 0;
872	if (cp->acw == 1) {
873		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
874			retaste_wait = 1;
875	}
876	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
877		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
878	if (retaste_wait) {
879		/*
880		 * After retaste event was send (inside g_access()), we can send
881		 * event to detach and destroy consumer.
882		 * A class, which has consumer to the given provider connected
883		 * will not receive retaste event for the provider.
884		 * This is the way how I ignore retaste events when I close
885		 * consumers opened for write: I detach and destroy consumer
886		 * after retaste event is sent.
887		 */
888		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
889		goto out;
890	}
891	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
892	g_detach(cp);
893	g_destroy_consumer(cp);
894out:
895	g_topology_unlock();
896}
897
898static void
899g_raid_orphan(struct g_consumer *cp)
900{
901	struct g_raid_disk *disk;
902
903	g_topology_assert();
904
905	disk = cp->private;
906	if (disk == NULL)
907		return;
908	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
909	    G_RAID_EVENT_DISK);
910}
911
912static void
913g_raid_clean(struct g_raid_volume *vol, int acw)
914{
915	struct g_raid_softc *sc;
916	int timeout;
917
918	sc = vol->v_softc;
919	g_topology_assert_not();
920	sx_assert(&sc->sc_lock, SX_XLOCKED);
921
922//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
923//		return;
924	if (!vol->v_dirty)
925		return;
926	if (vol->v_writes > 0)
927		return;
928	if (acw > 0 || (acw == -1 &&
929	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
930		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
931		if (!g_raid_shutdown && timeout > 0)
932			return;
933	}
934	vol->v_dirty = 0;
935	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
936	    vol->v_name);
937	g_raid_write_metadata(sc, vol, NULL, NULL);
938}
939
940static void
941g_raid_dirty(struct g_raid_volume *vol)
942{
943	struct g_raid_softc *sc;
944
945	sc = vol->v_softc;
946	g_topology_assert_not();
947	sx_assert(&sc->sc_lock, SX_XLOCKED);
948
949//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
950//		return;
951	vol->v_dirty = 1;
952	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
953	    vol->v_name);
954	g_raid_write_metadata(sc, vol, NULL, NULL);
955}
956
957void
958g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
959{
960	struct g_raid_volume *vol;
961	struct g_raid_subdisk *sd;
962	struct bio_queue_head queue;
963	struct bio *cbp;
964	int i;
965
966	vol = tr->tro_volume;
967
968	/*
969	 * Allocate all bios before sending any request, so we can return
970	 * ENOMEM in nice and clean way.
971	 */
972	bioq_init(&queue);
973	for (i = 0; i < vol->v_disks_count; i++) {
974		sd = &vol->v_subdisks[i];
975		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
976		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
977			continue;
978		cbp = g_clone_bio(bp);
979		if (cbp == NULL)
980			goto failure;
981		cbp->bio_caller1 = sd;
982		bioq_insert_tail(&queue, cbp);
983	}
984	while ((cbp = bioq_takefirst(&queue)) != NULL) {
985		sd = cbp->bio_caller1;
986		cbp->bio_caller1 = NULL;
987		g_raid_subdisk_iostart(sd, cbp);
988	}
989	return;
990failure:
991	while ((cbp = bioq_takefirst(&queue)) != NULL)
992		g_destroy_bio(cbp);
993	if (bp->bio_error == 0)
994		bp->bio_error = ENOMEM;
995	g_raid_iodone(bp, bp->bio_error);
996}
997
998static void
999g_raid_tr_kerneldump_common_done(struct bio *bp)
1000{
1001
1002	bp->bio_flags |= BIO_DONE;
1003}
1004
1005int
1006g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
1007    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1008{
1009	struct g_raid_softc *sc;
1010	struct g_raid_volume *vol;
1011	struct bio bp;
1012
1013	vol = tr->tro_volume;
1014	sc = vol->v_softc;
1015
1016	g_reset_bio(&bp);
1017	bp.bio_cmd = BIO_WRITE;
1018	bp.bio_done = g_raid_tr_kerneldump_common_done;
1019	bp.bio_attribute = NULL;
1020	bp.bio_offset = offset;
1021	bp.bio_length = length;
1022	bp.bio_data = virtual;
1023	bp.bio_to = vol->v_provider;
1024
1025	g_raid_start(&bp);
1026	while (!(bp.bio_flags & BIO_DONE)) {
1027		G_RAID_DEBUG1(4, sc, "Poll...");
1028		g_raid_poll(sc);
1029		DELAY(10);
1030	}
1031
1032	return (bp.bio_error != 0 ? EIO : 0);
1033}
1034
1035static int
1036g_raid_dump(void *arg,
1037    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1038{
1039	struct g_raid_volume *vol;
1040	int error;
1041
1042	vol = (struct g_raid_volume *)arg;
1043	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
1044	    (long long unsigned)offset, (long long unsigned)length);
1045
1046	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
1047	    virtual, physical, offset, length);
1048	return (error);
1049}
1050
1051static void
1052g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
1053{
1054	struct g_kerneldump *gkd;
1055	struct g_provider *pp;
1056	struct g_raid_volume *vol;
1057
1058	gkd = (struct g_kerneldump*)bp->bio_data;
1059	pp = bp->bio_to;
1060	vol = pp->private;
1061	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
1062		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
1063	gkd->di.dumper = g_raid_dump;
1064	gkd->di.priv = vol;
1065	gkd->di.blocksize = vol->v_sectorsize;
1066	gkd->di.maxiosize = DFLTPHYS;
1067	gkd->di.mediaoffset = gkd->offset;
1068	if ((gkd->offset + gkd->length) > vol->v_mediasize)
1069		gkd->length = vol->v_mediasize - gkd->offset;
1070	gkd->di.mediasize = gkd->length;
1071	g_io_deliver(bp, 0);
1072}
1073
1074static void
1075g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
1076{
1077	struct g_provider *pp;
1078	struct g_raid_volume *vol;
1079	struct g_raid_subdisk *sd;
1080	int i, val;
1081
1082	pp = bp->bio_to;
1083	vol = pp->private;
1084	for (i = 0; i < vol->v_disks_count; i++) {
1085		sd = &vol->v_subdisks[i];
1086		if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1087			continue;
1088		if (sd->sd_disk->d_candelete)
1089			break;
1090	}
1091	val = i < vol->v_disks_count;
1092	g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
1093}
1094
1095static void
1096g_raid_start(struct bio *bp)
1097{
1098	struct g_raid_softc *sc;
1099
1100	sc = bp->bio_to->geom->softc;
1101	/*
1102	 * If sc == NULL or there are no valid disks, provider's error
1103	 * should be set and g_raid_start() should not be called at all.
1104	 */
1105//	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
1106//	    ("Provider's error should be set (error=%d)(mirror=%s).",
1107//	    bp->bio_to->error, bp->bio_to->name));
1108	G_RAID_LOGREQ(3, bp, "Request received.");
1109
1110	switch (bp->bio_cmd) {
1111	case BIO_READ:
1112	case BIO_WRITE:
1113	case BIO_DELETE:
1114	case BIO_FLUSH:
1115	case BIO_SPEEDUP:
1116		break;
1117	case BIO_GETATTR:
1118		if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
1119			g_raid_candelete(sc, bp);
1120		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
1121			g_raid_kerneldump(sc, bp);
1122		else
1123			g_io_deliver(bp, EOPNOTSUPP);
1124		return;
1125	default:
1126		g_io_deliver(bp, EOPNOTSUPP);
1127		return;
1128	}
1129	mtx_lock(&sc->sc_queue_mtx);
1130	bioq_insert_tail(&sc->sc_queue, bp);
1131	mtx_unlock(&sc->sc_queue_mtx);
1132	if (!dumping) {
1133		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
1134		wakeup(sc);
1135	}
1136}
1137
1138static int
1139g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
1140{
1141	/*
1142	 * 5 cases:
1143	 * (1) bp entirely below NO
1144	 * (2) bp entirely above NO
1145	 * (3) bp start below, but end in range YES
1146	 * (4) bp entirely within YES
1147	 * (5) bp starts within, ends above YES
1148	 *
1149	 * lock range 10-19 (offset 10 length 10)
1150	 * (1) 1-5: first if kicks it out
1151	 * (2) 30-35: second if kicks it out
1152	 * (3) 5-15: passes both ifs
1153	 * (4) 12-14: passes both ifs
1154	 * (5) 19-20: passes both
1155	 */
1156	off_t lend = lstart + len - 1;
1157	off_t bstart = bp->bio_offset;
1158	off_t bend = bp->bio_offset + bp->bio_length - 1;
1159
1160	if (bend < lstart)
1161		return (0);
1162	if (lend < bstart)
1163		return (0);
1164	return (1);
1165}
1166
1167static int
1168g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
1169{
1170	struct g_raid_lock *lp;
1171
1172	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
1173
1174	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1175		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
1176			return (1);
1177	}
1178	return (0);
1179}
1180
1181static void
1182g_raid_start_request(struct bio *bp)
1183{
1184	struct g_raid_softc *sc;
1185	struct g_raid_volume *vol;
1186
1187	sc = bp->bio_to->geom->softc;
1188	sx_assert(&sc->sc_lock, SX_LOCKED);
1189	vol = bp->bio_to->private;
1190
1191	/*
1192	 * Check to see if this item is in a locked range.  If so,
1193	 * queue it to our locked queue and return.  We'll requeue
1194	 * it when the range is unlocked.  Internal I/O for the
1195	 * rebuild/rescan/recovery process is excluded from this
1196	 * check so we can actually do the recovery.
1197	 */
1198	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
1199	    g_raid_is_in_locked_range(vol, bp)) {
1200		G_RAID_LOGREQ(3, bp, "Defer request.");
1201		bioq_insert_tail(&vol->v_locked, bp);
1202		return;
1203	}
1204
1205	/*
1206	 * If we're actually going to do the write/delete, then
1207	 * update the idle stats for the volume.
1208	 */
1209	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1210		if (!vol->v_dirty)
1211			g_raid_dirty(vol);
1212		vol->v_writes++;
1213	}
1214
1215	/*
1216	 * Put request onto inflight queue, so we can check if new
1217	 * synchronization requests don't collide with it.  Then tell
1218	 * the transformation layer to start the I/O.
1219	 */
1220	bioq_insert_tail(&vol->v_inflight, bp);
1221	G_RAID_LOGREQ(4, bp, "Request started");
1222	G_RAID_TR_IOSTART(vol->v_tr, bp);
1223}
1224
1225static void
1226g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1227{
1228	off_t off, len;
1229	struct bio *nbp;
1230	struct g_raid_lock *lp;
1231
1232	vol->v_pending_lock = 0;
1233	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1234		if (lp->l_pending) {
1235			off = lp->l_offset;
1236			len = lp->l_length;
1237			lp->l_pending = 0;
1238			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1239				if (g_raid_bio_overlaps(nbp, off, len))
1240					lp->l_pending++;
1241			}
1242			if (lp->l_pending) {
1243				vol->v_pending_lock = 1;
1244				G_RAID_DEBUG1(4, vol->v_softc,
1245				    "Deferred lock(%jd, %jd) has %d pending",
1246				    (intmax_t)off, (intmax_t)(off + len),
1247				    lp->l_pending);
1248				continue;
1249			}
1250			G_RAID_DEBUG1(4, vol->v_softc,
1251			    "Deferred lock of %jd to %jd completed",
1252			    (intmax_t)off, (intmax_t)(off + len));
1253			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1254		}
1255	}
1256}
1257
1258void
1259g_raid_iodone(struct bio *bp, int error)
1260{
1261	struct g_raid_softc *sc;
1262	struct g_raid_volume *vol;
1263
1264	sc = bp->bio_to->geom->softc;
1265	sx_assert(&sc->sc_lock, SX_LOCKED);
1266	vol = bp->bio_to->private;
1267	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1268
1269	/* Update stats if we done write/delete. */
1270	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1271		vol->v_writes--;
1272		vol->v_last_write = time_uptime;
1273	}
1274
1275	bioq_remove(&vol->v_inflight, bp);
1276	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1277		g_raid_finish_with_locked_ranges(vol, bp);
1278	getmicrouptime(&vol->v_last_done);
1279	g_io_deliver(bp, error);
1280}
1281
1282int
1283g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1284    struct bio *ignore, void *argp)
1285{
1286	struct g_raid_softc *sc;
1287	struct g_raid_lock *lp;
1288	struct bio *bp;
1289
1290	sc = vol->v_softc;
1291	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1292	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1293	lp->l_offset = off;
1294	lp->l_length = len;
1295	lp->l_callback_arg = argp;
1296
1297	lp->l_pending = 0;
1298	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1299		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1300			lp->l_pending++;
1301	}
1302
1303	/*
1304	 * If there are any writes that are pending, we return EBUSY.  All
1305	 * callers will have to wait until all pending writes clear.
1306	 */
1307	if (lp->l_pending > 0) {
1308		vol->v_pending_lock = 1;
1309		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1310		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1311		return (EBUSY);
1312	}
1313	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1314	    (intmax_t)off, (intmax_t)(off+len));
1315	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1316	return (0);
1317}
1318
1319int
1320g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1321{
1322	struct g_raid_lock *lp;
1323	struct g_raid_softc *sc;
1324	struct bio *bp;
1325
1326	sc = vol->v_softc;
1327	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1328		if (lp->l_offset == off && lp->l_length == len) {
1329			LIST_REMOVE(lp, l_next);
1330			/* XXX
1331			 * Right now we just put them all back on the queue
1332			 * and hope for the best.  We hope this because any
1333			 * locked ranges will go right back on this list
1334			 * when the worker thread runs.
1335			 * XXX
1336			 */
1337			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1338			    (intmax_t)lp->l_offset,
1339			    (intmax_t)(lp->l_offset+lp->l_length));
1340			mtx_lock(&sc->sc_queue_mtx);
1341			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1342				bioq_insert_tail(&sc->sc_queue, bp);
1343			mtx_unlock(&sc->sc_queue_mtx);
1344			free(lp, M_RAID);
1345			return (0);
1346		}
1347	}
1348	return (EINVAL);
1349}
1350
1351void
1352g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1353{
1354	struct g_consumer *cp;
1355	struct g_raid_disk *disk, *tdisk;
1356
1357	bp->bio_caller1 = sd;
1358
1359	/*
1360	 * Make sure that the disk is present. Generally it is a task of
1361	 * transformation layers to not send requests to absent disks, but
1362	 * it is better to be safe and report situation then sorry.
1363	 */
1364	if (sd->sd_disk == NULL) {
1365		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1366nodisk:
1367		bp->bio_from = NULL;
1368		bp->bio_to = NULL;
1369		bp->bio_error = ENXIO;
1370		g_raid_disk_done(bp);
1371		return;
1372	}
1373	disk = sd->sd_disk;
1374	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1375	    disk->d_state != G_RAID_DISK_S_FAILED) {
1376		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1377		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1378		goto nodisk;
1379	}
1380
1381	cp = disk->d_consumer;
1382	bp->bio_from = cp;
1383	bp->bio_to = cp->provider;
1384	cp->index++;
1385
1386	/* Update average disks load. */
1387	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1388		if (tdisk->d_consumer == NULL)
1389			tdisk->d_load = 0;
1390		else
1391			tdisk->d_load = (tdisk->d_consumer->index *
1392			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1393	}
1394
1395	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1396	if (dumping) {
1397		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1398		if (bp->bio_cmd == BIO_WRITE) {
1399			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1400			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
1401		} else
1402			bp->bio_error = EOPNOTSUPP;
1403		g_raid_disk_done(bp);
1404	} else {
1405		bp->bio_done = g_raid_disk_done;
1406		bp->bio_offset += sd->sd_offset;
1407		G_RAID_LOGREQ(3, bp, "Sending request.");
1408		g_io_request(bp, cp);
1409	}
1410}
1411
1412int
1413g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
1414    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1415{
1416
1417	if (sd->sd_disk == NULL)
1418		return (ENXIO);
1419	if (sd->sd_disk->d_kd.di.dumper == NULL)
1420		return (EOPNOTSUPP);
1421	return (dump_write(&sd->sd_disk->d_kd.di,
1422	    virtual, physical,
1423	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
1424	    length));
1425}
1426
1427static void
1428g_raid_disk_done(struct bio *bp)
1429{
1430	struct g_raid_softc *sc;
1431	struct g_raid_subdisk *sd;
1432
1433	sd = bp->bio_caller1;
1434	sc = sd->sd_softc;
1435	mtx_lock(&sc->sc_queue_mtx);
1436	bioq_insert_tail(&sc->sc_queue, bp);
1437	mtx_unlock(&sc->sc_queue_mtx);
1438	if (!dumping)
1439		wakeup(sc);
1440}
1441
1442static void
1443g_raid_disk_done_request(struct bio *bp)
1444{
1445	struct g_raid_softc *sc;
1446	struct g_raid_disk *disk;
1447	struct g_raid_subdisk *sd;
1448	struct g_raid_volume *vol;
1449
1450	g_topology_assert_not();
1451
1452	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1453	sd = bp->bio_caller1;
1454	sc = sd->sd_softc;
1455	vol = sd->sd_volume;
1456	if (bp->bio_from != NULL) {
1457		bp->bio_from->index--;
1458		disk = bp->bio_from->private;
1459		if (disk == NULL)
1460			g_raid_kill_consumer(sc, bp->bio_from);
1461	}
1462	bp->bio_offset -= sd->sd_offset;
1463
1464	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1465}
1466
1467static void
1468g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1469{
1470
1471	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1472		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1473	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1474		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1475	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1476		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1477	else
1478		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1479	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1480		KASSERT(ep->e_error == 0,
1481		    ("Error cannot be handled."));
1482		g_raid_event_free(ep);
1483	} else {
1484		ep->e_flags |= G_RAID_EVENT_DONE;
1485		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1486		mtx_lock(&sc->sc_queue_mtx);
1487		wakeup(ep);
1488		mtx_unlock(&sc->sc_queue_mtx);
1489	}
1490}
1491
/*
 * Worker thread.
 *
 * Runs at PRIBIO priority and loops forever with the node lock (sc_lock)
 * held exclusively, releasing it only while sleeping.  Each iteration
 * handles exactly one of, in this order of preference:
 *  - the first pending event,
 *  - the first request on the node queue (either a new request for one of
 *    our providers or a completed disk request),
 *  - idle processing, when there has been nothing to do for long enough.
 * "arg" is the node softc.  The loop is left only through
 * g_raid_destroy_node(), which may terminate the thread.
 */
static void
g_raid_worker(void *arg)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	struct g_raid_volume *vol;
	struct bio *bp;
	struct timeval now, t;
	int timeout, rv;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		mtx_lock(&sc->sc_queue_mtx);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		bp = NULL;
		vol = NULL;
		rv = 0;
		ep = TAILQ_FIRST(&sc->sc_events);
		if (ep != NULL)
			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
			;
		else {
			/*
			 * Nothing queued.  Find the oldest completion time
			 * among volumes that have a transformation module
			 * and no requests in flight, and work out how much of
			 * the idle threshold is still left for it.
			 */
			getmicrouptime(&now);
			t = now;
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr &&
				    timevalcmp(&vol->v_last_done, &t, < ))
					t = vol->v_last_done;
			}
			/* t becomes (oldest completion - now), i.e. <= 0. */
			timevalsub(&t, &now);
			timeout = g_raid_idle_threshold +
			    t.tv_sec * 1000000 + t.tv_usec;
			if (timeout > 0) {
				/*
				 * Two steps to avoid overflows at HZ=1000
				 * and idle timeouts > 2.1s.  Some rounding
				 * errors can occur, but they are < 1tick,
				 * which is deemed to be close enough for
				 * this purpose.
				 */
				int micpertic = 1000000 / hz;
				timeout = (timeout + micpertic - 1) / micpertic;
				/*
				 * Sleep without the node lock.  PDROP is
				 * passed, so the queue mutex is presumably
				 * not held on return (see msleep(9)); that is
				 * why the mtx_unlock() below is jumped over.
				 */
				sx_xunlock(&sc->sc_lock);
				MSLEEP(rv, sc, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "-", timeout);
				sx_xlock(&sc->sc_lock);
				goto process;
			} else
				/* Threshold already expired: go idle now. */
				rv = EWOULDBLOCK;
		}
		mtx_unlock(&sc->sc_queue_mtx);
process:
		if (ep != NULL) {
			g_raid_handle_event(sc, ep);
		} else if (bp != NULL) {
			/*
			 * Requests addressed to one of our own providers are
			 * new work; everything else is a completed request
			 * coming back from a disk.
			 */
			if (bp->bio_to != NULL &&
			    bp->bio_to->geom == sc->sc_geom)
				g_raid_start_request(bp);
			else
				g_raid_disk_done_request(bp);
		} else if (rv == EWOULDBLOCK) {
			/*
			 * Idle processing: give every volume a chance to be
			 * marked clean, and call the transformation's IDLE
			 * method for volumes that have had nothing in flight
			 * for at least the idle threshold.
			 */
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				g_raid_clean(vol, -1);
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr) {
					t.tv_sec = g_raid_idle_threshold / 1000000;
					t.tv_usec = g_raid_idle_threshold % 1000000;
					timevaladd(&t, &vol->v_last_done);
					getmicrouptime(&now);
					if (timevalcmp(&t, &now, <= )) {
						G_RAID_TR_IDLE(vol->v_tr);
						/* Restart the idle period. */
						vol->v_last_done = now;
					}
				}
			}
		}
		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
			g_raid_destroy_node(sc, 1);	/* May not return. */
	}
}
1585
1586static void
1587g_raid_poll(struct g_raid_softc *sc)
1588{
1589	struct g_raid_event *ep;
1590	struct bio *bp;
1591
1592	sx_xlock(&sc->sc_lock);
1593	mtx_lock(&sc->sc_queue_mtx);
1594	/*
1595	 * First take a look at events.
1596	 * This is important to handle events before any I/O requests.
1597	 */
1598	ep = TAILQ_FIRST(&sc->sc_events);
1599	if (ep != NULL) {
1600		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1601		mtx_unlock(&sc->sc_queue_mtx);
1602		g_raid_handle_event(sc, ep);
1603		goto out;
1604	}
1605	bp = bioq_takefirst(&sc->sc_queue);
1606	if (bp != NULL) {
1607		mtx_unlock(&sc->sc_queue_mtx);
1608		if (bp->bio_from == NULL ||
1609		    bp->bio_from->geom != sc->sc_geom)
1610			g_raid_start_request(bp);
1611		else
1612			g_raid_disk_done_request(bp);
1613	}
1614out:
1615	sx_xunlock(&sc->sc_lock);
1616}
1617
1618static void
1619g_raid_launch_provider(struct g_raid_volume *vol)
1620{
1621	struct g_raid_disk *disk;
1622	struct g_raid_subdisk *sd;
1623	struct g_raid_softc *sc;
1624	struct g_provider *pp;
1625	char name[G_RAID_MAX_VOLUMENAME];
1626	off_t off;
1627	int i;
1628
1629	sc = vol->v_softc;
1630	sx_assert(&sc->sc_lock, SX_LOCKED);
1631
1632	g_topology_lock();
1633	/* Try to name provider with volume name. */
1634	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
1635	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
1636	    g_provider_by_name(name) != NULL) {
1637		/* Otherwise use sequential volume number. */
1638		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
1639	}
1640
1641	pp = g_new_providerf(sc->sc_geom, "%s", name);
1642	pp->flags |= G_PF_DIRECT_RECEIVE;
1643	if (vol->v_tr->tro_class->trc_accept_unmapped) {
1644		pp->flags |= G_PF_ACCEPT_UNMAPPED;
1645		for (i = 0; i < vol->v_disks_count; i++) {
1646			sd = &vol->v_subdisks[i];
1647			if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1648				continue;
1649			if ((sd->sd_disk->d_consumer->provider->flags &
1650			    G_PF_ACCEPT_UNMAPPED) == 0)
1651				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
1652		}
1653	}
1654	pp->private = vol;
1655	pp->mediasize = vol->v_mediasize;
1656	pp->sectorsize = vol->v_sectorsize;
1657	pp->stripesize = 0;
1658	pp->stripeoffset = 0;
1659	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1660	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1661	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
1662	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
1663		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
1664		    disk->d_consumer != NULL &&
1665		    disk->d_consumer->provider != NULL) {
1666			pp->stripesize = disk->d_consumer->provider->stripesize;
1667			off = disk->d_consumer->provider->stripeoffset;
1668			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
1669			if (off > 0)
1670				pp->stripeoffset %= off;
1671		}
1672		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
1673			pp->stripesize *= (vol->v_disks_count - 1);
1674			pp->stripeoffset *= (vol->v_disks_count - 1);
1675		}
1676	} else
1677		pp->stripesize = vol->v_strip_size;
1678	vol->v_provider = pp;
1679	g_error_provider(pp, 0);
1680	g_topology_unlock();
1681	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
1682	    pp->name, vol->v_name);
1683}
1684
1685static void
1686g_raid_destroy_provider(struct g_raid_volume *vol)
1687{
1688	struct g_raid_softc *sc;
1689	struct g_provider *pp;
1690	struct bio *bp, *tmp;
1691
1692	g_topology_assert_not();
1693	sc = vol->v_softc;
1694	pp = vol->v_provider;
1695	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1696
1697	g_topology_lock();
1698	g_error_provider(pp, ENXIO);
1699	mtx_lock(&sc->sc_queue_mtx);
1700	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1701		if (bp->bio_to != pp)
1702			continue;
1703		bioq_remove(&sc->sc_queue, bp);
1704		g_io_deliver(bp, ENXIO);
1705	}
1706	mtx_unlock(&sc->sc_queue_mtx);
1707	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1708	    pp->name, vol->v_name);
1709	g_wither_provider(pp, ENXIO);
1710	g_topology_unlock();
1711	vol->v_provider = NULL;
1712}
1713
1714/*
1715 * Update device state.
1716 */
1717static int
1718g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1719{
1720	struct g_raid_softc *sc;
1721
1722	sc = vol->v_softc;
1723	sx_assert(&sc->sc_lock, SX_XLOCKED);
1724
1725	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1726	    g_raid_volume_event2str(event),
1727	    vol->v_name);
1728	switch (event) {
1729	case G_RAID_VOLUME_E_DOWN:
1730		if (vol->v_provider != NULL)
1731			g_raid_destroy_provider(vol);
1732		break;
1733	case G_RAID_VOLUME_E_UP:
1734		if (vol->v_provider == NULL)
1735			g_raid_launch_provider(vol);
1736		break;
1737	case G_RAID_VOLUME_E_START:
1738		if (vol->v_tr)
1739			G_RAID_TR_START(vol->v_tr);
1740		return (0);
1741	default:
1742		if (sc->sc_md)
1743			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1744		return (0);
1745	}
1746
1747	/* Manage root mount release. */
1748	if (vol->v_starting) {
1749		vol->v_starting = 0;
1750		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1751		root_mount_rel(vol->v_rootmount);
1752		vol->v_rootmount = NULL;
1753	}
1754	if (vol->v_stopping && vol->v_provider_open == 0)
1755		g_raid_destroy_volume(vol);
1756	return (0);
1757}
1758
1759/*
1760 * Update subdisk state.
1761 */
1762static int
1763g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1764{
1765	struct g_raid_softc *sc;
1766	struct g_raid_volume *vol;
1767
1768	sc = sd->sd_softc;
1769	vol = sd->sd_volume;
1770	sx_assert(&sc->sc_lock, SX_XLOCKED);
1771
1772	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1773	    g_raid_subdisk_event2str(event),
1774	    vol->v_name, sd->sd_pos,
1775	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1776	if (vol->v_tr)
1777		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1778
1779	return (0);
1780}
1781
1782/*
1783 * Update disk state.
1784 */
1785static int
1786g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1787{
1788	struct g_raid_softc *sc;
1789
1790	sc = disk->d_softc;
1791	sx_assert(&sc->sc_lock, SX_XLOCKED);
1792
1793	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1794	    g_raid_disk_event2str(event),
1795	    g_raid_get_diskname(disk));
1796
1797	if (sc->sc_md)
1798		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1799	return (0);
1800}
1801
1802/*
1803 * Node event.
1804 */
1805static int
1806g_raid_update_node(struct g_raid_softc *sc, u_int event)
1807{
1808	sx_assert(&sc->sc_lock, SX_XLOCKED);
1809
1810	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1811	    g_raid_node_event2str(event));
1812
1813	if (event == G_RAID_NODE_E_WAKE)
1814		return (0);
1815	if (sc->sc_md)
1816		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1817	return (0);
1818}
1819
1820static int
1821g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
1822{
1823	struct g_raid_volume *vol;
1824	struct g_raid_softc *sc;
1825	int dcw, opens, error = 0;
1826
1827	g_topology_assert();
1828	sc = pp->geom->softc;
1829	vol = pp->private;
1830	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
1831	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
1832
1833	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
1834	    acr, acw, ace);
1835	dcw = pp->acw + acw;
1836
1837	g_topology_unlock();
1838	sx_xlock(&sc->sc_lock);
1839	/* Deny new opens while dying. */
1840	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
1841		error = ENXIO;
1842		goto out;
1843	}
1844	/* Deny write opens for read-only volumes. */
1845	if (vol->v_read_only && acw > 0) {
1846		error = EROFS;
1847		goto out;
1848	}
1849	if (dcw == 0)
1850		g_raid_clean(vol, dcw);
1851	vol->v_provider_open += acr + acw + ace;
1852	/* Handle delayed node destruction. */
1853	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
1854	    vol->v_provider_open == 0) {
1855		/* Count open volumes. */
1856		opens = g_raid_nopens(sc);
1857		if (opens == 0) {
1858			sc->sc_stopping = G_RAID_DESTROY_HARD;
1859			/* Wake up worker to make it selfdestruct. */
1860			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1861		}
1862	}
1863	/* Handle open volume destruction. */
1864	if (vol->v_stopping && vol->v_provider_open == 0)
1865		g_raid_destroy_volume(vol);
1866out:
1867	sx_xunlock(&sc->sc_lock);
1868	g_topology_lock();
1869	return (error);
1870}
1871
1872struct g_raid_softc *
1873g_raid_create_node(struct g_class *mp,
1874    const char *name, struct g_raid_md_object *md)
1875{
1876	struct g_raid_softc *sc;
1877	struct g_geom *gp;
1878	int error;
1879
1880	g_topology_assert();
1881	G_RAID_DEBUG(1, "Creating array %s.", name);
1882
1883	gp = g_new_geomf(mp, "%s", name);
1884	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1885	gp->start = g_raid_start;
1886	gp->orphan = g_raid_orphan;
1887	gp->access = g_raid_access;
1888	gp->dumpconf = g_raid_dumpconf;
1889
1890	sc->sc_md = md;
1891	sc->sc_geom = gp;
1892	sc->sc_flags = 0;
1893	TAILQ_INIT(&sc->sc_volumes);
1894	TAILQ_INIT(&sc->sc_disks);
1895	sx_init(&sc->sc_lock, "graid:lock");
1896	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1897	TAILQ_INIT(&sc->sc_events);
1898	bioq_init(&sc->sc_queue);
1899	gp->softc = sc;
1900	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1901	    "g_raid %s", name);
1902	if (error != 0) {
1903		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1904		mtx_destroy(&sc->sc_queue_mtx);
1905		sx_destroy(&sc->sc_lock);
1906		g_destroy_geom(sc->sc_geom);
1907		free(sc, M_RAID);
1908		return (NULL);
1909	}
1910
1911	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1912	return (sc);
1913}
1914
1915struct g_raid_volume *
1916g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
1917{
1918	struct g_raid_volume	*vol, *vol1;
1919	int i;
1920
1921	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
1922	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
1923	vol->v_softc = sc;
1924	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
1925	vol->v_state = G_RAID_VOLUME_S_STARTING;
1926	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1927	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
1928	vol->v_rotate_parity = 1;
1929	bioq_init(&vol->v_inflight);
1930	bioq_init(&vol->v_locked);
1931	LIST_INIT(&vol->v_locks);
1932	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1933		vol->v_subdisks[i].sd_softc = sc;
1934		vol->v_subdisks[i].sd_volume = vol;
1935		vol->v_subdisks[i].sd_pos = i;
1936		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
1937	}
1938
1939	/* Find free ID for this volume. */
1940	g_topology_lock();
1941	vol1 = vol;
1942	if (id >= 0) {
1943		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1944			if (vol1->v_global_id == id)
1945				break;
1946		}
1947	}
1948	if (vol1 != NULL) {
1949		for (id = 0; ; id++) {
1950			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1951				if (vol1->v_global_id == id)
1952					break;
1953			}
1954			if (vol1 == NULL)
1955				break;
1956		}
1957	}
1958	vol->v_global_id = id;
1959	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1960	g_topology_unlock();
1961
1962	/* Delay root mounting. */
1963	vol->v_rootmount = root_mount_hold("GRAID");
1964	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1965	vol->v_starting = 1;
1966	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1967	return (vol);
1968}
1969
1970struct g_raid_disk *
1971g_raid_create_disk(struct g_raid_softc *sc)
1972{
1973	struct g_raid_disk	*disk;
1974
1975	G_RAID_DEBUG1(1, sc, "Creating disk.");
1976	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
1977	disk->d_softc = sc;
1978	disk->d_state = G_RAID_DISK_S_NONE;
1979	TAILQ_INIT(&disk->d_subdisks);
1980	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
1981	return (disk);
1982}
1983
1984int g_raid_start_volume(struct g_raid_volume *vol)
1985{
1986	struct g_raid_tr_class *class;
1987	struct g_raid_tr_object *obj;
1988	int status;
1989
1990	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
1991	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
1992		if (!class->trc_enable)
1993			continue;
1994		G_RAID_DEBUG1(2, vol->v_softc,
1995		    "Tasting volume %s for %s transformation.",
1996		    vol->v_name, class->name);
1997		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
1998		    M_WAITOK);
1999		obj->tro_class = class;
2000		obj->tro_volume = vol;
2001		status = G_RAID_TR_TASTE(obj, vol);
2002		if (status != G_RAID_TR_TASTE_FAIL)
2003			break;
2004		kobj_delete((kobj_t)obj, M_RAID);
2005	}
2006	if (class == NULL) {
2007		G_RAID_DEBUG1(0, vol->v_softc,
2008		    "No transformation module found for %s.",
2009		    vol->v_name);
2010		vol->v_tr = NULL;
2011		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
2012		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
2013		    G_RAID_EVENT_VOLUME);
2014		return (-1);
2015	}
2016	G_RAID_DEBUG1(2, vol->v_softc,
2017	    "Transformation module %s chosen for %s.",
2018	    class->name, vol->v_name);
2019	vol->v_tr = obj;
2020	return (0);
2021}
2022
/*
 * Destroy a node, or get as far as currently possible.
 *
 * The node is marked for hard destruction, then all volumes and all disks
 * are destroyed.  If any of them cannot be destroyed yet, EBUSY is
 * returned and the rest of the node is left in place.  Afterwards the
 * metadata object is freed and the geom is withered.
 *
 * "worker" is nonzero when called from the worker thread itself: in that
 * case the remaining node resources are released, sleepers on
 * &sc->sc_stopping are woken and the thread exits, so the call does not
 * return.  Otherwise a WAKE event is sent so that the worker performs
 * that final step, and 0 is returned.
 */
int
g_raid_destroy_node(struct g_raid_softc *sc, int worker)
{
	struct g_raid_volume *vol, *tmpv;
	struct g_raid_disk *disk, *tmpd;
	int error = 0;

	sc->sc_stopping = G_RAID_DESTROY_HARD;
	/* Try every volume even if an earlier one is still busy. */
	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
		if (g_raid_destroy_volume(vol))
			error = EBUSY;
	}
	if (error)
		return (error);
	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
		if (g_raid_destroy_disk(disk))
			error = EBUSY;
	}
	if (error)
		return (error);
	if (sc->sc_md) {
		G_RAID_MD_FREE(sc->sc_md);
		kobj_delete((kobj_t)sc->sc_md, M_RAID);
		sc->sc_md = NULL;
	}
	if (sc->sc_geom != NULL) {
		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
		g_topology_lock();
		/* Detach the softc from the geom before withering it. */
		sc->sc_geom->softc = NULL;
		g_wither_geom(sc->sc_geom, ENXIO);
		g_topology_unlock();
		sc->sc_geom = NULL;
	} else
		G_RAID_DEBUG(1, "Array destroyed.");
	if (worker) {
		/*
		 * Running in the worker: drop events still targeting the
		 * node, tear down the locks, notify whoever sleeps on
		 * sc_stopping (see g_raid_destroy()), free the softc and
		 * terminate this thread.
		 */
		g_raid_event_cancel(sc, sc);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_xunlock(&sc->sc_lock);
		sx_destroy(&sc->sc_lock);
		wakeup(&sc->sc_stopping);
		free(sc, M_RAID);
		curthread->td_pflags &= ~TDP_GEOM;
		G_RAID_DEBUG(1, "Thread exiting.");
		kproc_exit(0);
	} else {
		/* Wake up worker to make it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}
2073
/*
 * Destroy a volume, or start stopping it.
 *
 * The volume is always marked as stopping.  EBUSY is returned, and the
 * volume kept, when it still has to be stopped by its transformation
 * module, when events for it are still queued, or when its provider
 * still exists or is open.  Otherwise the transformation object, root
 * mount hold, list memberships, subdisk links and metadata are released,
 * the volume is freed and 0 is returned.
 */
int
g_raid_destroy_volume(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_disk *disk;
	int i;

	sc = vol->v_softc;
	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
	vol->v_stopping = 1;
	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
		if (vol->v_tr) {
			/*
			 * Ask the transformation module to stop and report
			 * the volume as busy for now.
			 */
			G_RAID_TR_STOP(vol->v_tr);
			return (EBUSY);
		} else
			vol->v_state = G_RAID_VOLUME_S_STOPPED;
	}
	if (g_raid_event_check(sc, vol) != 0)
		return (EBUSY);
	if (vol->v_provider != NULL)
		return (EBUSY);
	if (vol->v_provider_open != 0)
		return (EBUSY);
	if (vol->v_tr) {
		G_RAID_TR_FREE(vol->v_tr);
		kobj_delete((kobj_t)vol->v_tr, M_RAID);
		vol->v_tr = NULL;
	}
	/* Release the root mount hold if it has not been released yet. */
	if (vol->v_rootmount)
		root_mount_rel(vol->v_rootmount);
	g_topology_lock();
	LIST_REMOVE(vol, v_global_next);
	g_topology_unlock();
	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
	/* Drop pending subdisk events and unlink subdisks from their disks. */
	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
		disk = vol->v_subdisks[i].sd_disk;
		if (disk == NULL)
			continue;
		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
	}
	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
	if (sc->sc_md)
		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
	g_raid_event_cancel(sc, vol);
	free(vol, M_RAID);
	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
		/* Wake up worker to let it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}
2126
2127int
2128g_raid_destroy_disk(struct g_raid_disk *disk)
2129{
2130	struct g_raid_softc *sc;
2131	struct g_raid_subdisk *sd, *tmp;
2132
2133	sc = disk->d_softc;
2134	G_RAID_DEBUG1(2, sc, "Destroying disk.");
2135	if (disk->d_consumer) {
2136		g_raid_kill_consumer(sc, disk->d_consumer);
2137		disk->d_consumer = NULL;
2138	}
2139	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
2140		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
2141		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2142		    G_RAID_EVENT_SUBDISK);
2143		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
2144		sd->sd_disk = NULL;
2145	}
2146	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
2147	if (sc->sc_md)
2148		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
2149	g_raid_event_cancel(sc, disk);
2150	free(disk, M_RAID);
2151	return (0);
2152}
2153
2154int
2155g_raid_destroy(struct g_raid_softc *sc, int how)
2156{
2157	int error, opens;
2158
2159	g_topology_assert_not();
2160	if (sc == NULL)
2161		return (ENXIO);
2162	sx_assert(&sc->sc_lock, SX_XLOCKED);
2163
2164	/* Count open volumes. */
2165	opens = g_raid_nopens(sc);
2166
2167	/* React on some opened volumes. */
2168	if (opens > 0) {
2169		switch (how) {
2170		case G_RAID_DESTROY_SOFT:
2171			G_RAID_DEBUG1(1, sc,
2172			    "%d volumes are still open.",
2173			    opens);
2174			sx_xunlock(&sc->sc_lock);
2175			return (EBUSY);
2176		case G_RAID_DESTROY_DELAYED:
2177			G_RAID_DEBUG1(1, sc,
2178			    "Array will be destroyed on last close.");
2179			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
2180			sx_xunlock(&sc->sc_lock);
2181			return (EBUSY);
2182		case G_RAID_DESTROY_HARD:
2183			G_RAID_DEBUG1(1, sc,
2184			    "%d volumes are still open.",
2185			    opens);
2186		}
2187	}
2188
2189	/* Mark node for destruction. */
2190	sc->sc_stopping = G_RAID_DESTROY_HARD;
2191	/* Wake up worker to let it selfdestruct. */
2192	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2193	/* Sleep until node destroyed. */
2194	error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
2195	    PRIBIO | PDROP, "r:destroy", hz * 3);
2196	return (error == EWOULDBLOCK ? EBUSY : 0);
2197}
2198
2199static void
2200g_raid_taste_orphan(struct g_consumer *cp)
2201{
2202
2203	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2204	    cp->provider->name));
2205}
2206
2207static struct g_geom *
2208g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2209{
2210	struct g_consumer *cp;
2211	struct g_geom *gp, *geom;
2212	struct g_raid_md_class *class;
2213	struct g_raid_md_object *obj;
2214	int status;
2215
2216	g_topology_assert();
2217	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2218	if (!g_raid_enable)
2219		return (NULL);
2220	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
2221
2222	geom = NULL;
2223	status = G_RAID_MD_TASTE_FAIL;
2224	gp = g_new_geomf(mp, "raid:taste");
2225	/*
2226	 * This orphan function should be never called.
2227	 */
2228	gp->orphan = g_raid_taste_orphan;
2229	cp = g_new_consumer(gp);
2230	cp->flags |= G_CF_DIRECT_RECEIVE;
2231	g_attach(cp, pp);
2232	if (g_access(cp, 1, 0, 0) != 0)
2233		goto ofail;
2234
2235	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2236		if (!class->mdc_enable)
2237			continue;
2238		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2239		    pp->name, class->name);
2240		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2241		    M_WAITOK);
2242		obj->mdo_class = class;
2243		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2244		if (status != G_RAID_MD_TASTE_NEW)
2245			kobj_delete((kobj_t)obj, M_RAID);
2246		if (status != G_RAID_MD_TASTE_FAIL)
2247			break;
2248	}
2249
2250	if (status == G_RAID_MD_TASTE_FAIL)
2251		(void)g_access(cp, -1, 0, 0);
2252ofail:
2253	g_detach(cp);
2254	g_destroy_consumer(cp);
2255	g_destroy_geom(gp);
2256	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2257	return (geom);
2258}
2259
2260int
2261g_raid_create_node_format(const char *format, struct gctl_req *req,
2262    struct g_geom **gp)
2263{
2264	struct g_raid_md_class *class;
2265	struct g_raid_md_object *obj;
2266	int status;
2267
2268	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2269	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2270		if (strcasecmp(class->name, format) == 0)
2271			break;
2272	}
2273	if (class == NULL) {
2274		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2275		return (G_RAID_MD_TASTE_FAIL);
2276	}
2277	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2278	    M_WAITOK);
2279	obj->mdo_class = class;
2280	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
2281	if (status != G_RAID_MD_TASTE_NEW)
2282		kobj_delete((kobj_t)obj, M_RAID);
2283	return (status);
2284}
2285
2286static int
2287g_raid_destroy_geom(struct gctl_req *req __unused,
2288    struct g_class *mp __unused, struct g_geom *gp)
2289{
2290	struct g_raid_softc *sc;
2291	int error;
2292
2293	g_topology_unlock();
2294	sc = gp->softc;
2295	sx_xlock(&sc->sc_lock);
2296	g_cancel_event(sc);
2297	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2298	g_topology_lock();
2299	return (error);
2300}
2301
2302void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
2303    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2304{
2305
2306	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2307		return;
2308	if (sc->sc_md)
2309		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2310}
2311
2312void g_raid_fail_disk(struct g_raid_softc *sc,
2313    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2314{
2315
2316	if (disk == NULL)
2317		disk = sd->sd_disk;
2318	if (disk == NULL) {
2319		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
2320		return;
2321	}
2322	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2323		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
2324		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2325		return;
2326	}
2327	if (sc->sc_md)
2328		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2329}
2330
/*
 * GEOM dumpconf method: append a description of a RAID object to 'sb',
 * one tag per line, each line prefixed with 'indent'.
 *
 * What is described depends on which argument is set:
 *   pp != NULL  - the volume stored in pp->private;
 *   cp != NULL  - the disk stored in cp->private (nothing if NULL);
 *   otherwise   - the node (geom) itself.
 *
 * Called with the topology lock held (asserted).  Each branch drops the
 * topology lock, takes sc_lock exclusively while it reads the softc
 * data, then releases sc_lock and re-takes the topology lock before
 * returning.  Nothing is emitted when the geom has no softc.
 */
static void
g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct g_raid_disk *disk;
	int i, s;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	if (pp != NULL) {
		/* Provider: describe the volume behind it. */
		vol = pp->private;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		/*
		 * NOTE(review): sc->sc_md is dereferenced here without the
		 * NULL check used in the node branch below -- confirm that
		 * a volume provider cannot exist without a metadata object.
		 */
		sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
		    sc->sc_md->mdo_class->name,
		    g_raid_volume_level2str(vol->v_raid_level,
		    vol->v_raid_level_qualifier));
		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
		    vol->v_name);
		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
		    g_raid_volume_level2str(vol->v_raid_level,
		    vol->v_raid_level_qualifier));
		sbuf_printf(sb,
		    "%s<Transformation>%s</Transformation>\n", indent,
		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    vol->v_disks_count);
		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
		    vol->v_strip_size);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid_volume_state2str(vol->v_state));
		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
		    vol->v_dirty ? "Yes" : "No");
		/*
		 * Comma-separated list with one entry per subdisk slot:
		 * "<diskname or NONE> (<state>[ <percent>%])".
		 */
		sbuf_printf(sb, "%s<Subdisks>", indent);
		for (i = 0; i < vol->v_disks_count; i++) {
			sd = &vol->v_subdisks[i];
			if (sd->sd_disk != NULL &&
			    sd->sd_disk->d_consumer != NULL) {
				sbuf_printf(sb, "%s ",
				    g_raid_get_diskname(sd->sd_disk));
			} else {
				sbuf_cat(sb, "NONE ");
			}
			sbuf_printf(sb, "(%s",
			    g_raid_subdisk_state2str(sd->sd_state));
			/* Progress is shown only while rebuilding/resyncing. */
			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
				sbuf_printf(sb, " %d%%",
				    (int)(sd->sd_rebuild_pos * 100 /
				     sd->sd_size));
			}
			sbuf_cat(sb, ")");
			if (i + 1 < vol->v_disks_count)
				sbuf_cat(sb, ", ");
		}
		sbuf_cat(sb, "</Subdisks>\n");
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else if (cp != NULL) {
		/* Consumer: describe the disk behind it, if there is one. */
		disk = cp->private;
		if (disk == NULL)
			return;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		/*
		 * <State>: the disk state, followed in parentheses by the
		 * state of every subdisk on this disk (if any).
		 */
		sbuf_printf(sb, "%s<State>%s", indent,
		    g_raid_disk_state2str(disk->d_state));
		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
			sbuf_cat(sb, " (");
			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
				sbuf_printf(sb, "%s",
				    g_raid_subdisk_state2str(sd->sd_state));
				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
					sbuf_printf(sb, " %d%%",
					    (int)(sd->sd_rebuild_pos * 100 /
					     sd->sd_size));
				}
				if (TAILQ_NEXT(sd, sd_next))
					sbuf_cat(sb, ", ");
			}
			sbuf_cat(sb, ")");
		}
		sbuf_cat(sb, "</State>\n");
		/*
		 * <Subdisks>: one "r<volume id>(<volume name>):<pos>@<offset>"
		 * entry per subdisk on this disk.
		 */
		sbuf_printf(sb, "%s<Subdisks>", indent);
		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
			sbuf_printf(sb, "r%d(%s):%d@%ju",
			    sd->sd_volume->v_global_id,
			    sd->sd_volume->v_name,
			    sd->sd_pos, (uintmax_t)sd->sd_offset);
			if (TAILQ_NEXT(sd, sd_next))
				sbuf_cat(sb, ", ");
		}
		sbuf_cat(sb, "</Subdisks>\n");
		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
		    disk->d_read_errs);
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else {
		/* Neither provider nor consumer: describe the node itself. */
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		if (sc->sc_md) {
			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
			    sc->sc_md->mdo_class->name);
		}
		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
			/*
			 * Report the numerically lowest v_state found among
			 * the node's volumes, starting from 0xff.
			 */
			s = 0xff;
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				if (vol->v_state < s)
					s = vol->v_state;
			}
			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
			    g_raid_volume_state2str(s));
		}
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	}
}
2454
2455static void
2456g_raid_shutdown_post_sync(void *arg, int howto)
2457{
2458	struct g_class *mp;
2459	struct g_geom *gp, *gp2;
2460	struct g_raid_softc *sc;
2461	struct g_raid_volume *vol;
2462
2463	mp = arg;
2464	g_topology_lock();
2465	g_raid_shutdown = 1;
2466	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2467		if ((sc = gp->softc) == NULL)
2468			continue;
2469		g_topology_unlock();
2470		sx_xlock(&sc->sc_lock);
2471		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
2472			g_raid_clean(vol, -1);
2473		g_cancel_event(sc);
2474		g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2475		g_topology_lock();
2476	}
2477	g_topology_unlock();
2478}
2479
2480static void
2481g_raid_init(struct g_class *mp)
2482{
2483
2484	g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
2485	    g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
2486	if (g_raid_post_sync == NULL)
2487		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2488	g_raid_started = 1;
2489}
2490
2491static void
2492g_raid_fini(struct g_class *mp)
2493{
2494
2495	if (g_raid_post_sync != NULL)
2496		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
2497	g_raid_started = 0;
2498}
2499
2500int
2501g_raid_md_modevent(module_t mod, int type, void *arg)
2502{
2503	struct g_raid_md_class *class, *c, *nc;
2504	int error;
2505
2506	error = 0;
2507	class = arg;
2508	switch (type) {
2509	case MOD_LOAD:
2510		c = LIST_FIRST(&g_raid_md_classes);
2511		if (c == NULL || c->mdc_priority > class->mdc_priority)
2512			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2513		else {
2514			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2515			    nc->mdc_priority < class->mdc_priority)
2516				c = nc;
2517			LIST_INSERT_AFTER(c, class, mdc_list);
2518		}
2519		if (g_raid_started)
2520			g_retaste(&g_raid_class);
2521		break;
2522	case MOD_UNLOAD:
2523		LIST_REMOVE(class, mdc_list);
2524		break;
2525	default:
2526		error = EOPNOTSUPP;
2527		break;
2528	}
2529
2530	return (error);
2531}
2532
2533int
2534g_raid_tr_modevent(module_t mod, int type, void *arg)
2535{
2536	struct g_raid_tr_class *class, *c, *nc;
2537	int error;
2538
2539	error = 0;
2540	class = arg;
2541	switch (type) {
2542	case MOD_LOAD:
2543		c = LIST_FIRST(&g_raid_tr_classes);
2544		if (c == NULL || c->trc_priority > class->trc_priority)
2545			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2546		else {
2547			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2548			    nc->trc_priority < class->trc_priority)
2549				c = nc;
2550			LIST_INSERT_AFTER(c, class, trc_list);
2551		}
2552		break;
2553	case MOD_UNLOAD:
2554		LIST_REMOVE(class, trc_list);
2555		break;
2556	default:
2557		error = EOPNOTSUPP;
2558		break;
2559	}
2560
2561	return (error);
2562}
2563
2564/*
2565 * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
2566 * to reduce module priority, allowing submodules to register them first.
2567 */
2568static moduledata_t g_raid_mod = {
2569	"g_raid",
2570	g_modevent,
2571	&g_raid_class
2572};
2573DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
2574MODULE_VERSION(geom_raid, 0);
2575