1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD$");
32
33#include <sys/param.h>
34#include <sys/bio.h>
35#include <sys/endian.h>
36#include <sys/kernel.h>
37#include <sys/kobj.h>
38#include <sys/limits.h>
39#include <sys/lock.h>
40#include <sys/malloc.h>
41#include <sys/mutex.h>
42#include <sys/systm.h>
43#include <sys/taskqueue.h>
44#include <sys/disk.h>
45#include <geom/geom.h>
46#include <geom/geom_dbg.h>
47#include "geom/raid/g_raid.h"
48#include "g_raid_md_if.h"
49
50static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata");
51
/*
 * Packed map record embedded in a volume record.  The structure is
 * variable length: disk_idx[] is declared with one element but actually
 * carries total_disks entries, so the next record starts right after
 * disk_idx[total_disks - 1].
 */
struct intel_raid_map {
	/* Low 32 bits of 64-bit values; the high halves are the *_hi fields. */
	uint32_t	offset;
	uint32_t	disk_sectors;
	uint32_t	stripe_count;
	uint16_t	strip_sectors;
	/* One of the INTEL_S_* values below. */
	uint8_t		status;
#define INTEL_S_READY           0x00
#define INTEL_S_UNINITIALIZED   0x01
#define INTEL_S_DEGRADED        0x02
#define INTEL_S_FAILURE         0x03

	/* One of the INTEL_T_* values below. */
	uint8_t		type;
#define INTEL_T_RAID0           0x00
#define INTEL_T_RAID1           0x01
#define INTEL_T_RAID5           0x05

	/* Number of valid entries in disk_idx[]. */
	uint8_t		total_disks;
	uint8_t		total_domains;
	uint8_t		failed_disk_num;
	uint8_t		ddf;
	/* High 32 bits of offset, disk_sectors and stripe_count. */
	uint32_t	offset_hi;
	uint32_t	disk_sectors_hi;
	uint32_t	stripe_count_hi;
	uint32_t	filler_2[4];
	/*
	 * Each entry holds an index into intel_raid_conf.disk[] in the
	 * INTEL_DI_IDX bits; INTEL_DI_RBLD is a per-entry flag bit
	 * (presumably "rebuild in progress" -- confirm against users).
	 */
	uint32_t	disk_idx[1];	/* total_disks entries. */
#define INTEL_DI_IDX	0x00ffffff
#define INTEL_DI_RBLD	0x01000000
} __packed;
80
/*
 * Packed volume record.  Volume records follow the disk array of
 * struct intel_raid_conf.  The record is variable length: it ends with
 * one map, or two maps when migr_state is non-zero, and each map is
 * itself variable length.
 */
struct intel_raid_vol {
	uint8_t		name[16];
	u_int64_t	total_sectors __packed;
	/* Bit mask of the INTEL_ST_* flags below. */
	uint32_t	state;
#define INTEL_ST_BOOTABLE		0x00000001
#define INTEL_ST_BOOT_DEVICE		0x00000002
#define INTEL_ST_READ_COALESCING	0x00000004
#define INTEL_ST_WRITE_COALESCING	0x00000008
#define INTEL_ST_LAST_SHUTDOWN_DIRTY	0x00000010
#define INTEL_ST_HIDDEN_AT_BOOT		0x00000020
#define INTEL_ST_CURRENTLY_HIDDEN	0x00000040
#define INTEL_ST_VERIFY_AND_FIX		0x00000080
#define INTEL_ST_MAP_STATE_UNINIT	0x00000100
#define INTEL_ST_NO_AUTO_RECOVERY	0x00000200
#define INTEL_ST_CLONE_N_GO		0x00000400
#define INTEL_ST_CLONE_MAN_SYNC		0x00000800
#define INTEL_ST_CNG_MASTER_DISK_NUM	0x00001000
	uint32_t	reserved;
	uint8_t		migr_priority;
	uint8_t		num_sub_vols;
	uint8_t		tid;
	uint8_t		cng_master_disk;
	uint16_t	cache_policy;
	/* One of the INTEL_CNGST_* values below. */
	uint8_t		cng_state;
#define INTEL_CNGST_UPDATED		0
#define INTEL_CNGST_NEEDS_UPDATE	1
#define INTEL_CNGST_MASTER_MISSING	2
	uint8_t		cng_sub_state;
	uint32_t	filler_0[10];

	/* Low 32 bits; the high half is curr_migr_unit_hi. */
	uint32_t	curr_migr_unit;
	uint32_t	checkpoint_id;
	/* Non-zero means a second map follows the first one. */
	uint8_t		migr_state;
	/* One of the INTEL_MT_* values below. */
	uint8_t		migr_type;
#define INTEL_MT_INIT		0
#define INTEL_MT_REBUILD	1
#define INTEL_MT_VERIFY		2
#define INTEL_MT_GEN_MIGR	3
#define INTEL_MT_STATE_CHANGE	4
#define INTEL_MT_REPAIR		5
	uint8_t		dirty;
	uint8_t		fs_state;
	uint16_t	verify_errors;
	uint16_t	bad_blocks;
	uint32_t	curr_migr_unit_hi;
	uint32_t	filler_1[3];
	struct intel_raid_map map[1];	/* 2 entries if migr_state != 0. */
} __packed;
129
/*
 * Packed per-disk record stored in the disk[] array of
 * struct intel_raid_conf.  Disks are matched by their serial string.
 */
struct intel_raid_disk {
#define INTEL_SERIAL_LEN	16
	uint8_t		serial[INTEL_SERIAL_LEN];
	/* Low 32 bits of the sector count; the high half is sectors_hi. */
	uint32_t	sectors;
	uint32_t	id;
	/* Bit mask of the INTEL_F_* flags below. */
	uint32_t	flags;
#define INTEL_F_SPARE		0x01
#define INTEL_F_ASSIGNED	0x02
#define INTEL_F_FAILED		0x04
#define INTEL_F_ONLINE		0x08
#define INTEL_F_DISABLED	0x80
	uint32_t	owner_cfg_num;
	uint32_t	sectors_hi;
	uint32_t	filler[3];
} __packed;
145
/*
 * Packed metadata header (the "anchor").  It is followed in memory by
 * total_disks disk records and then total_volumes volume records, all
 * of which are covered by config_size and by the checksum.
 */
struct intel_raid_conf {
	/* Signature; compared against INTEL_MAGIC when reading. */
	uint8_t		intel_id[24];
#define INTEL_MAGIC             "Intel Raid ISM Cfg Sig. "

	/* Six-character version string, not NUL-terminated. */
	uint8_t		version[6];
#define INTEL_VERSION_1000	"1.0.00"	/* RAID0 */
#define INTEL_VERSION_1100	"1.1.00"	/* RAID1 */
#define INTEL_VERSION_1200	"1.2.00"	/* Many volumes */
#define INTEL_VERSION_1201	"1.2.01"	/* 3 or 4 disks */
#define INTEL_VERSION_1202	"1.2.02"	/* RAID5 */
#define INTEL_VERSION_1204	"1.2.04"	/* 5 or 6 disks */
#define INTEL_VERSION_1206	"1.2.06"	/* CNG */
#define INTEL_VERSION_1300	"1.3.00"	/* Attributes */

	uint8_t		dummy_0[2];
	/*
	 * 32-bit sum of all config_size / 4 words of the metadata,
	 * computed with this field treated as zero.
	 */
	uint32_t	checksum;
	/* Total metadata size in bytes, header included. */
	uint32_t	config_size;
	uint32_t	config_id;
	uint32_t	generation;
	uint32_t	error_log_size;
	/* Bit mask of the INTEL_ATTR_* flags below. */
	uint32_t	attributes;
#define INTEL_ATTR_RAID0	0x00000001
#define INTEL_ATTR_RAID1	0x00000002
#define INTEL_ATTR_RAID10	0x00000004
#define INTEL_ATTR_RAID1E	0x00000008
#define INTEL_ATTR_RAID5	0x00000010
#define INTEL_ATTR_RAIDCNG	0x00000020
#define INTEL_ATTR_EXT_STRIP	0x00000040
#define INTEL_ATTR_NVM_CACHE	0x02000000
#define INTEL_ATTR_2TB_DISK	0x04000000
#define INTEL_ATTR_BBM		0x08000000
#define INTEL_ATTR_NVM_CACHE2	0x10000000
#define INTEL_ATTR_2TB		0x20000000
#define INTEL_ATTR_PM		0x40000000
#define INTEL_ATTR_CHECKSUM	0x80000000

	/* Number of entries in disk[]. */
	uint8_t		total_disks;
	/* Number of volume records following disk[]. */
	uint8_t		total_volumes;
	uint8_t		error_log_pos;
	uint8_t		dummy_2[1];
	uint32_t	cache_size;
	uint32_t	orig_config_id;
	uint32_t	pwr_cycle_count;
	uint32_t	bbm_log_size;
	uint32_t	filler_0[35];
	struct intel_raid_disk	disk[1];	/* total_disks entries. */
	/* Here goes total_volumes of struct intel_raid_vol. */
} __packed;
194
/*
 * Attribute bits this module accepts.  Metadata of version 1.3.00 that
 * has any attribute bit outside this mask is rejected when read.
 */
#define INTEL_ATTR_SUPPORTED	( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 |	\
    INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 |		\
    INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK |	\
    INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM )
199
/*
 * Upper bound, in bytes, of metadata describing ndisks disks: the header
 * (which already holds one disk record), the remaining disk records, two
 * volumes (each already holding one map with one disk index), two extra
 * maps for volumes under migration, and the remaining disk indexes of
 * all four maps.
 *
 * The argument is parenthesized so that an expression such as "n + 1"
 * expands correctly.
 */
#define INTEL_MAX_MD_SIZE(ndisks)				\
    (sizeof(struct intel_raid_conf) +				\
     sizeof(struct intel_raid_disk) * ((ndisks) - 1) +		\
     sizeof(struct intel_raid_vol) * 2 +			\
     sizeof(struct intel_raid_map) * 2 +			\
     sizeof(uint32_t) * ((ndisks) - 1) * 4)
206
/* Per-disk private data, reached through g_raid_disk.d_md_data. */
struct g_raid_md_intel_perdisk {
	struct intel_raid_conf	*pd_meta;
	/* Position of the disk in the metadata disk[] array. */
	int			 pd_disk_pos;
	/* This disk's own metadata disk record. */
	struct intel_raid_disk	 pd_disk_meta;
};
212
/* Per-volume private data, reached through g_raid_volume.v_md_data. */
struct g_raid_md_intel_pervolume {
	/* Index of the volume record inside the metadata. */
	int			 pv_volume_pos;
	/* NOTE(review): pv_cng* presumably mirror the CNG volume state -- confirm. */
	int			 pv_cng;
	int			 pv_cng_man_sync;
	int			 pv_cng_master_disk;
};
219
/*
 * Intel metadata object.  mdio_base must stay the first member: the
 * generic struct g_raid_md_object pointer is cast to this type.
 */
struct g_raid_md_intel_object {
	struct g_raid_md_object	 mdio_base;
	uint32_t		 mdio_config_id;
	uint32_t		 mdio_orig_config_id;
	uint32_t		 mdio_generation;
	/* Array-wide copy of the metadata. */
	struct intel_raid_conf	*mdio_meta;
	struct callout		 mdio_start_co;	/* STARTING state timer. */
	int			 mdio_disks_present;
	/* Non-zero once the array has been started. */
	int			 mdio_started;
	int			 mdio_incomplete;
	struct root_hold_token	*mdio_rootmount; /* Root mount delay token. */
};
232
/*
 * Forward declarations of the metadata class methods, using the method
 * function types of the g_raid_md interface.
 */
static g_raid_md_create_t g_raid_md_create_intel;
static g_raid_md_taste_t g_raid_md_taste_intel;
static g_raid_md_event_t g_raid_md_event_intel;
static g_raid_md_ctl_t g_raid_md_ctl_intel;
static g_raid_md_write_t g_raid_md_write_intel;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel;
static g_raid_md_free_disk_t g_raid_md_free_disk_intel;
static g_raid_md_free_volume_t g_raid_md_free_volume_intel;
static g_raid_md_free_t g_raid_md_free_intel;
242
/*
 * kobj method table binding the g_raid_md interface methods to their
 * Intel implementations; terminated by an all-zero entry.
 */
static kobj_method_t g_raid_md_intel_methods[] = {
	KOBJMETHOD(g_raid_md_create,	g_raid_md_create_intel),
	KOBJMETHOD(g_raid_md_taste,	g_raid_md_taste_intel),
	KOBJMETHOD(g_raid_md_event,	g_raid_md_event_intel),
	KOBJMETHOD(g_raid_md_ctl,	g_raid_md_ctl_intel),
	KOBJMETHOD(g_raid_md_write,	g_raid_md_write_intel),
	KOBJMETHOD(g_raid_md_fail_disk,	g_raid_md_fail_disk_intel),
	KOBJMETHOD(g_raid_md_free_disk,	g_raid_md_free_disk_intel),
	KOBJMETHOD(g_raid_md_free_volume,	g_raid_md_free_volume_intel),
	KOBJMETHOD(g_raid_md_free,	g_raid_md_free_intel),
	{ 0, 0 }
};
255
/*
 * Metadata class descriptor: class name, method table and the size of
 * the per-object structure, followed by the enable flag and priority.
 */
static struct g_raid_md_class g_raid_md_intel_class = {
	"Intel",
	g_raid_md_intel_methods,
	sizeof(struct g_raid_md_intel_object),
	.mdc_enable = 1,
	.mdc_priority = 100
};
263
264static struct intel_raid_map *
265intel_get_map(struct intel_raid_vol *mvol, int i)
266{
267	struct intel_raid_map *mmap;
268
269	if (i > (mvol->migr_state ? 1 : 0))
270		return (NULL);
271	mmap = &mvol->map[0];
272	for (; i > 0; i--) {
273		mmap = (struct intel_raid_map *)
274		    &mmap->disk_idx[mmap->total_disks];
275	}
276	return ((struct intel_raid_map *)mmap);
277}
278
279static struct intel_raid_vol *
280intel_get_volume(struct intel_raid_conf *meta, int i)
281{
282	struct intel_raid_vol *mvol;
283	struct intel_raid_map *mmap;
284
285	if (i > 1)
286		return (NULL);
287	mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks];
288	for (; i > 0; i--) {
289		mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0);
290		mvol = (struct intel_raid_vol *)
291		    &mmap->disk_idx[mmap->total_disks];
292	}
293	return (mvol);
294}
295
296static off_t
297intel_get_map_offset(struct intel_raid_map *mmap)
298{
299	off_t offset = (off_t)mmap->offset_hi << 32;
300
301	offset += mmap->offset;
302	return (offset);
303}
304
305static void
306intel_set_map_offset(struct intel_raid_map *mmap, off_t offset)
307{
308
309	mmap->offset = offset & 0xffffffff;
310	mmap->offset_hi = offset >> 32;
311}
312
313static off_t
314intel_get_map_disk_sectors(struct intel_raid_map *mmap)
315{
316	off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32;
317
318	disk_sectors += mmap->disk_sectors;
319	return (disk_sectors);
320}
321
322static void
323intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors)
324{
325
326	mmap->disk_sectors = disk_sectors & 0xffffffff;
327	mmap->disk_sectors_hi = disk_sectors >> 32;
328}
329
330static void
331intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count)
332{
333
334	mmap->stripe_count = stripe_count & 0xffffffff;
335	mmap->stripe_count_hi = stripe_count >> 32;
336}
337
338static off_t
339intel_get_disk_sectors(struct intel_raid_disk *disk)
340{
341	off_t sectors = (off_t)disk->sectors_hi << 32;
342
343	sectors += disk->sectors;
344	return (sectors);
345}
346
347static void
348intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors)
349{
350
351	disk->sectors = sectors & 0xffffffff;
352	disk->sectors_hi = sectors >> 32;
353}
354
355static off_t
356intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol)
357{
358	off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32;
359
360	curr_migr_unit += vol->curr_migr_unit;
361	return (curr_migr_unit);
362}
363
364static void
365intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit)
366{
367
368	vol->curr_migr_unit = curr_migr_unit & 0xffffffff;
369	vol->curr_migr_unit_hi = curr_migr_unit >> 32;
370}
371
372static char *
373intel_status2str(int status)
374{
375
376	switch (status) {
377	case INTEL_S_READY:
378		return ("READY");
379	case INTEL_S_UNINITIALIZED:
380		return ("UNINITIALIZED");
381	case INTEL_S_DEGRADED:
382		return ("DEGRADED");
383	case INTEL_S_FAILURE:
384		return ("FAILURE");
385	default:
386		return ("UNKNOWN");
387	}
388}
389
390static char *
391intel_type2str(int type)
392{
393
394	switch (type) {
395	case INTEL_T_RAID0:
396		return ("RAID0");
397	case INTEL_T_RAID1:
398		return ("RAID1");
399	case INTEL_T_RAID5:
400		return ("RAID5");
401	default:
402		return ("UNKNOWN");
403	}
404}
405
406static char *
407intel_cngst2str(int cng_state)
408{
409
410	switch (cng_state) {
411	case INTEL_CNGST_UPDATED:
412		return ("UPDATED");
413	case INTEL_CNGST_NEEDS_UPDATE:
414		return ("NEEDS_UPDATE");
415	case INTEL_CNGST_MASTER_MISSING:
416		return ("MASTER_MISSING");
417	default:
418		return ("UNKNOWN");
419	}
420}
421
422static char *
423intel_mt2str(int type)
424{
425
426	switch (type) {
427	case INTEL_MT_INIT:
428		return ("INIT");
429	case INTEL_MT_REBUILD:
430		return ("REBUILD");
431	case INTEL_MT_VERIFY:
432		return ("VERIFY");
433	case INTEL_MT_GEN_MIGR:
434		return ("GEN_MIGR");
435	case INTEL_MT_STATE_CHANGE:
436		return ("STATE_CHANGE");
437	case INTEL_MT_REPAIR:
438		return ("REPAIR");
439	default:
440		return ("UNKNOWN");
441	}
442}
443
444static void
445g_raid_md_intel_print(struct intel_raid_conf *meta)
446{
447	struct intel_raid_vol *mvol;
448	struct intel_raid_map *mmap;
449	int i, j, k;
450
451	if (g_raid_debug < 1)
452		return;
453
454	printf("********* ATA Intel MatrixRAID Metadata *********\n");
455	printf("intel_id            <%.24s>\n", meta->intel_id);
456	printf("version             <%.6s>\n", meta->version);
457	printf("checksum            0x%08x\n", meta->checksum);
458	printf("config_size         0x%08x\n", meta->config_size);
459	printf("config_id           0x%08x\n", meta->config_id);
460	printf("generation          0x%08x\n", meta->generation);
461	printf("error_log_size      %d\n", meta->error_log_size);
462	printf("attributes          0x%b\n", meta->attributes,
463		"\020"
464		"\001RAID0"
465		"\002RAID1"
466		"\003RAID10"
467		"\004RAID1E"
468		"\005RAID15"
469		"\006RAIDCNG"
470		"\007EXT_STRIP"
471		"\032NVM_CACHE"
472		"\0332TB_DISK"
473		"\034BBM"
474		"\035NVM_CACHE"
475		"\0362TB"
476		"\037PM"
477		"\040CHECKSUM");
478	printf("total_disks         %u\n", meta->total_disks);
479	printf("total_volumes       %u\n", meta->total_volumes);
480	printf("error_log_pos       %u\n", meta->error_log_pos);
481	printf("cache_size          %u\n", meta->cache_size);
482	printf("orig_config_id      0x%08x\n", meta->orig_config_id);
483	printf("pwr_cycle_count     %u\n", meta->pwr_cycle_count);
484	printf("bbm_log_size        %u\n", meta->bbm_log_size);
485	printf("Flags: S - Spare, A - Assigned, F - Failed, O - Online, D - Disabled\n");
486	printf("DISK#   serial disk_sectors disk_sectors_hi disk_id flags owner\n");
487	for (i = 0; i < meta->total_disks; i++ ) {
488		printf("    %d   <%.16s> %u %u 0x%08x 0x%b %08x\n", i,
489		    meta->disk[i].serial, meta->disk[i].sectors,
490		    meta->disk[i].sectors_hi, meta->disk[i].id,
491		    meta->disk[i].flags, "\20\01S\02A\03F\04O\05D",
492		    meta->disk[i].owner_cfg_num);
493	}
494	for (i = 0; i < meta->total_volumes; i++) {
495		mvol = intel_get_volume(meta, i);
496		printf(" ****** Volume %d ******\n", i);
497		printf(" name               %.16s\n", mvol->name);
498		printf(" total_sectors      %ju\n", mvol->total_sectors);
499		printf(" state              0x%b\n", mvol->state,
500			"\020"
501			"\001BOOTABLE"
502			"\002BOOT_DEVICE"
503			"\003READ_COALESCING"
504			"\004WRITE_COALESCING"
505			"\005LAST_SHUTDOWN_DIRTY"
506			"\006HIDDEN_AT_BOOT"
507			"\007CURRENTLY_HIDDEN"
508			"\010VERIFY_AND_FIX"
509			"\011MAP_STATE_UNINIT"
510			"\012NO_AUTO_RECOVERY"
511			"\013CLONE_N_GO"
512			"\014CLONE_MAN_SYNC"
513			"\015CNG_MASTER_DISK_NUM");
514		printf(" reserved           %u\n", mvol->reserved);
515		printf(" migr_priority      %u\n", mvol->migr_priority);
516		printf(" num_sub_vols       %u\n", mvol->num_sub_vols);
517		printf(" tid                %u\n", mvol->tid);
518		printf(" cng_master_disk    %u\n", mvol->cng_master_disk);
519		printf(" cache_policy       %u\n", mvol->cache_policy);
520		printf(" cng_state          %u (%s)\n", mvol->cng_state,
521			intel_cngst2str(mvol->cng_state));
522		printf(" cng_sub_state      %u\n", mvol->cng_sub_state);
523		printf(" curr_migr_unit     %u\n", mvol->curr_migr_unit);
524		printf(" curr_migr_unit_hi  %u\n", mvol->curr_migr_unit_hi);
525		printf(" checkpoint_id      %u\n", mvol->checkpoint_id);
526		printf(" migr_state         %u\n", mvol->migr_state);
527		printf(" migr_type          %u (%s)\n", mvol->migr_type,
528			intel_mt2str(mvol->migr_type));
529		printf(" dirty              %u\n", mvol->dirty);
530		printf(" fs_state           %u\n", mvol->fs_state);
531		printf(" verify_errors      %u\n", mvol->verify_errors);
532		printf(" bad_blocks         %u\n", mvol->bad_blocks);
533
534		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
535			printf("  *** Map %d ***\n", j);
536			mmap = intel_get_map(mvol, j);
537			printf("  offset            %u\n", mmap->offset);
538			printf("  offset_hi         %u\n", mmap->offset_hi);
539			printf("  disk_sectors      %u\n", mmap->disk_sectors);
540			printf("  disk_sectors_hi   %u\n", mmap->disk_sectors_hi);
541			printf("  stripe_count      %u\n", mmap->stripe_count);
542			printf("  stripe_count_hi   %u\n", mmap->stripe_count_hi);
543			printf("  strip_sectors     %u\n", mmap->strip_sectors);
544			printf("  status            %u (%s)\n", mmap->status,
545				intel_status2str(mmap->status));
546			printf("  type              %u (%s)\n", mmap->type,
547				intel_type2str(mmap->type));
548			printf("  total_disks       %u\n", mmap->total_disks);
549			printf("  total_domains     %u\n", mmap->total_domains);
550			printf("  failed_disk_num   %u\n", mmap->failed_disk_num);
551			printf("  ddf               %u\n", mmap->ddf);
552			printf("  disk_idx         ");
553			for (k = 0; k < mmap->total_disks; k++)
554				printf(" 0x%08x", mmap->disk_idx[k]);
555			printf("\n");
556		}
557	}
558	printf("=================================================\n");
559}
560
561static struct intel_raid_conf *
562intel_meta_copy(struct intel_raid_conf *meta)
563{
564	struct intel_raid_conf *nmeta;
565
566	nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK);
567	memcpy(nmeta, meta, meta->config_size);
568	return (nmeta);
569}
570
571static int
572intel_meta_find_disk(struct intel_raid_conf *meta, char *serial)
573{
574	int pos;
575
576	for (pos = 0; pos < meta->total_disks; pos++) {
577		if (strncmp(meta->disk[pos].serial,
578		    serial, INTEL_SERIAL_LEN) == 0)
579			return (pos);
580	}
581	return (-1);
582}
583
584static struct intel_raid_conf *
585intel_meta_read(struct g_consumer *cp)
586{
587	struct g_provider *pp;
588	struct intel_raid_conf *meta;
589	struct intel_raid_vol *mvol;
590	struct intel_raid_map *mmap, *mmap1;
591	char *buf;
592	int error, i, j, k, left, size;
593	uint32_t checksum, *ptr;
594
595	pp = cp->provider;
596
597	/* Read the anchor sector. */
598	buf = g_read_data(cp,
599	    pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error);
600	if (buf == NULL) {
601		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
602		    pp->name, error);
603		return (NULL);
604	}
605	meta = (struct intel_raid_conf *)buf;
606
607	/* Check if this is an Intel RAID struct */
608	if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) {
609		G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name);
610		g_free(buf);
611		return (NULL);
612	}
613	if (meta->config_size > 65536 ||
614	    meta->config_size < sizeof(struct intel_raid_conf)) {
615		G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d",
616		    meta->config_size);
617		g_free(buf);
618		return (NULL);
619	}
620	size = meta->config_size;
621	meta = malloc(size, M_MD_INTEL, M_WAITOK);
622	memcpy(meta, buf, min(size, pp->sectorsize));
623	g_free(buf);
624
625	/* Read all the rest, if needed. */
626	if (meta->config_size > pp->sectorsize) {
627		left = (meta->config_size - 1) / pp->sectorsize;
628		buf = g_read_data(cp,
629		    pp->mediasize - pp->sectorsize * (2 + left),
630		    pp->sectorsize * left, &error);
631		if (buf == NULL) {
632			G_RAID_DEBUG(1, "Cannot read remaining metadata"
633			    " part from %s (error=%d).",
634			    pp->name, error);
635			free(meta, M_MD_INTEL);
636			return (NULL);
637		}
638		memcpy(((char *)meta) + pp->sectorsize, buf,
639		    pp->sectorsize * left);
640		g_free(buf);
641	}
642
643	/* Check metadata checksum. */
644	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
645	    i < (meta->config_size / sizeof(uint32_t)); i++) {
646		checksum += *ptr++;
647	}
648	checksum -= meta->checksum;
649	if (checksum != meta->checksum) {
650		G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name);
651		free(meta, M_MD_INTEL);
652		return (NULL);
653	}
654
655	/* Validate metadata size. */
656	size = sizeof(struct intel_raid_conf) +
657	    sizeof(struct intel_raid_disk) * (meta->total_disks - 1) +
658	    sizeof(struct intel_raid_vol) * meta->total_volumes;
659	if (size > meta->config_size) {
660badsize:
661		G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d",
662		    meta->config_size, size);
663		free(meta, M_MD_INTEL);
664		return (NULL);
665	}
666	for (i = 0; i < meta->total_volumes; i++) {
667		mvol = intel_get_volume(meta, i);
668		mmap = intel_get_map(mvol, 0);
669		size += 4 * (mmap->total_disks - 1);
670		if (size > meta->config_size)
671			goto badsize;
672		if (mvol->migr_state) {
673			size += sizeof(struct intel_raid_map);
674			if (size > meta->config_size)
675				goto badsize;
676			mmap = intel_get_map(mvol, 1);
677			size += 4 * (mmap->total_disks - 1);
678			if (size > meta->config_size)
679				goto badsize;
680		}
681	}
682
683	g_raid_md_intel_print(meta);
684
685	if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) {
686		G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'",
687		    meta->version);
688		free(meta, M_MD_INTEL);
689		return (NULL);
690	}
691
692	if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 &&
693	    (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) {
694		G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x",
695		    meta->attributes & ~INTEL_ATTR_SUPPORTED);
696		free(meta, M_MD_INTEL);
697		return (NULL);
698	}
699
700	/* Validate disk indexes. */
701	for (i = 0; i < meta->total_volumes; i++) {
702		mvol = intel_get_volume(meta, i);
703		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
704			mmap = intel_get_map(mvol, j);
705			for (k = 0; k < mmap->total_disks; k++) {
706				if ((mmap->disk_idx[k] & INTEL_DI_IDX) >
707				    meta->total_disks) {
708					G_RAID_DEBUG(1, "Intel metadata disk"
709					    " index %d too big (>%d)",
710					    mmap->disk_idx[k] & INTEL_DI_IDX,
711					    meta->total_disks);
712					free(meta, M_MD_INTEL);
713					return (NULL);
714				}
715			}
716		}
717	}
718
719	/* Validate migration types. */
720	for (i = 0; i < meta->total_volumes; i++) {
721		mvol = intel_get_volume(meta, i);
722		/* Deny unknown migration types. */
723		if (mvol->migr_state &&
724		    mvol->migr_type != INTEL_MT_INIT &&
725		    mvol->migr_type != INTEL_MT_REBUILD &&
726		    mvol->migr_type != INTEL_MT_VERIFY &&
727		    mvol->migr_type != INTEL_MT_GEN_MIGR &&
728		    mvol->migr_type != INTEL_MT_REPAIR) {
729			G_RAID_DEBUG(1, "Intel metadata has unsupported"
730			    " migration type %d", mvol->migr_type);
731			free(meta, M_MD_INTEL);
732			return (NULL);
733		}
734		/* Deny general migrations except SINGLE->RAID1. */
735		if (mvol->migr_state &&
736		    mvol->migr_type == INTEL_MT_GEN_MIGR) {
737			mmap = intel_get_map(mvol, 0);
738			mmap1 = intel_get_map(mvol, 1);
739			if (mmap1->total_disks != 1 ||
740			    mmap->type != INTEL_T_RAID1 ||
741			    mmap->total_disks != 2 ||
742			    mmap->offset != mmap1->offset ||
743			    mmap->disk_sectors != mmap1->disk_sectors ||
744			    mmap->total_domains != mmap->total_disks ||
745			    mmap->offset_hi != mmap1->offset_hi ||
746			    mmap->disk_sectors_hi != mmap1->disk_sectors_hi ||
747			    (mmap->disk_idx[0] != mmap1->disk_idx[0] &&
748			     mmap->disk_idx[0] != mmap1->disk_idx[1])) {
749				G_RAID_DEBUG(1, "Intel metadata has unsupported"
750				    " variant of general migration");
751				free(meta, M_MD_INTEL);
752				return (NULL);
753			}
754		}
755	}
756
757	return (meta);
758}
759
760static int
761intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta)
762{
763	struct g_provider *pp;
764	char *buf;
765	int error, i, sectors;
766	uint32_t checksum, *ptr;
767
768	pp = cp->provider;
769
770	/* Recalculate checksum for case if metadata were changed. */
771	meta->checksum = 0;
772	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
773	    i < (meta->config_size / sizeof(uint32_t)); i++) {
774		checksum += *ptr++;
775	}
776	meta->checksum = checksum;
777
778	/* Create and fill buffer. */
779	sectors = howmany(meta->config_size, pp->sectorsize);
780	buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
781	if (sectors > 1) {
782		memcpy(buf, ((char *)meta) + pp->sectorsize,
783		    (sectors - 1) * pp->sectorsize);
784	}
785	memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize);
786
787	error = g_write_data(cp,
788	    pp->mediasize - pp->sectorsize * (1 + sectors),
789	    buf, pp->sectorsize * sectors);
790	if (error != 0) {
791		G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
792		    pp->name, error);
793	}
794
795	free(buf, M_MD_INTEL);
796	return (error);
797}
798
799static int
800intel_meta_erase(struct g_consumer *cp)
801{
802	struct g_provider *pp;
803	char *buf;
804	int error;
805
806	pp = cp->provider;
807	buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
808	error = g_write_data(cp,
809	    pp->mediasize - 2 * pp->sectorsize,
810	    buf, pp->sectorsize);
811	if (error != 0) {
812		G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
813		    pp->name, error);
814	}
815	free(buf, M_MD_INTEL);
816	return (error);
817}
818
819static int
820intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d)
821{
822	struct intel_raid_conf *meta;
823	int error;
824
825	/* Fill anchor and single disk. */
826	meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO);
827	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
828	memcpy(&meta->version[0], INTEL_VERSION_1000,
829	    sizeof(INTEL_VERSION_1000) - 1);
830	meta->config_size = INTEL_MAX_MD_SIZE(1);
831	meta->config_id = meta->orig_config_id = arc4random();
832	meta->generation = 1;
833	meta->total_disks = 1;
834	meta->disk[0] = *d;
835	error = intel_meta_write(cp, meta);
836	free(meta, M_MD_INTEL);
837	return (error);
838}
839
840static struct g_raid_disk *
841g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id)
842{
843	struct g_raid_disk	*disk;
844	struct g_raid_md_intel_perdisk *pd;
845
846	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
847		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
848		if (pd->pd_disk_pos == id)
849			break;
850	}
851	return (disk);
852}
853
854static int
855g_raid_md_intel_supported(int level, int qual, int disks, int force)
856{
857
858	switch (level) {
859	case G_RAID_VOLUME_RL_RAID0:
860		if (disks < 1)
861			return (0);
862		if (!force && (disks < 2 || disks > 6))
863			return (0);
864		break;
865	case G_RAID_VOLUME_RL_RAID1:
866		if (disks < 1)
867			return (0);
868		if (!force && (disks != 2))
869			return (0);
870		break;
871	case G_RAID_VOLUME_RL_RAID1E:
872		if (disks < 2)
873			return (0);
874		if (!force && (disks != 4))
875			return (0);
876		break;
877	case G_RAID_VOLUME_RL_RAID5:
878		if (disks < 3)
879			return (0);
880		if (!force && disks > 6)
881			return (0);
882		if (qual != G_RAID_VOLUME_RLQ_R5LA)
883			return (0);
884		break;
885	default:
886		return (0);
887	}
888	if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
889		return (0);
890	return (1);
891}
892
893static struct g_raid_volume *
894g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id)
895{
896	struct g_raid_volume	*mvol;
897	struct g_raid_md_intel_pervolume *pv;
898
899	TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) {
900		pv = mvol->v_md_data;
901		if (pv->pv_volume_pos == id)
902			break;
903	}
904	return (mvol);
905}
906
907static int
908g_raid_md_intel_start_disk(struct g_raid_disk *disk)
909{
910	struct g_raid_softc *sc;
911	struct g_raid_subdisk *sd, *tmpsd;
912	struct g_raid_disk *olddisk, *tmpdisk;
913	struct g_raid_md_object *md;
914	struct g_raid_md_intel_object *mdi;
915	struct g_raid_md_intel_pervolume *pv;
916	struct g_raid_md_intel_perdisk *pd, *oldpd;
917	struct intel_raid_conf *meta;
918	struct intel_raid_vol *mvol;
919	struct intel_raid_map *mmap0, *mmap1;
920	int disk_pos, resurrection = 0, migr_global, i;
921
922	sc = disk->d_softc;
923	md = sc->sc_md;
924	mdi = (struct g_raid_md_intel_object *)md;
925	meta = mdi->mdio_meta;
926	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
927	olddisk = NULL;
928
929	/* Find disk position in metadata by its serial. */
930	disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial);
931	if (disk_pos < 0) {
932		G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk");
933		/* Failed stale disk is useless for us. */
934		if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) &&
935		    !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) {
936			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
937			return (0);
938		}
939		/* If we are in the start process, that's all for now. */
940		if (!mdi->mdio_started)
941			goto nofit;
942		/*
943		 * If we have already started - try to get use of the disk.
944		 * Try to replace OFFLINE disks first, then FAILED.
945		 */
946		TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) {
947			if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE &&
948			    tmpdisk->d_state != G_RAID_DISK_S_FAILED)
949				continue;
950			/* Make sure this disk is big enough. */
951			TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) {
952				off_t disk_sectors =
953				    intel_get_disk_sectors(&pd->pd_disk_meta);
954
955				if (sd->sd_offset + sd->sd_size + 4096 >
956				    disk_sectors * 512) {
957					G_RAID_DEBUG1(1, sc,
958					    "Disk too small (%llu < %llu)",
959					    (unsigned long long)
960					    disk_sectors * 512,
961					    (unsigned long long)
962					    sd->sd_offset + sd->sd_size + 4096);
963					break;
964				}
965			}
966			if (sd != NULL)
967				continue;
968			if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) {
969				olddisk = tmpdisk;
970				break;
971			} else if (olddisk == NULL)
972				olddisk = tmpdisk;
973		}
974		if (olddisk == NULL) {
975nofit:
976			if (pd->pd_disk_meta.flags & INTEL_F_SPARE) {
977				g_raid_change_disk_state(disk,
978				    G_RAID_DISK_S_SPARE);
979				return (1);
980			} else {
981				g_raid_change_disk_state(disk,
982				    G_RAID_DISK_S_STALE);
983				return (0);
984			}
985		}
986		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
987		disk_pos = oldpd->pd_disk_pos;
988		resurrection = 1;
989	}
990
991	if (olddisk == NULL) {
992		/* Find placeholder by position. */
993		olddisk = g_raid_md_intel_get_disk(sc, disk_pos);
994		if (olddisk == NULL)
995			panic("No disk at position %d!", disk_pos);
996		if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
997			G_RAID_DEBUG1(1, sc, "More than one disk for pos %d",
998			    disk_pos);
999			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
1000			return (0);
1001		}
1002		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
1003	}
1004
1005	/* Replace failed disk or placeholder with new disk. */
1006	TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) {
1007		TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next);
1008		TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1009		sd->sd_disk = disk;
1010	}
1011	oldpd->pd_disk_pos = -2;
1012	pd->pd_disk_pos = disk_pos;
1013
1014	/* If it was placeholder -- destroy it. */
1015	if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) {
1016		g_raid_destroy_disk(olddisk);
1017	} else {
1018		/* Otherwise, make it STALE_FAILED. */
1019		g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED);
1020		/* Update global metadata just in case. */
1021		memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta,
1022		    sizeof(struct intel_raid_disk));
1023	}
1024
1025	/* Welcome the new disk. */
1026	if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
1027	    !(pd->pd_disk_meta.flags & INTEL_F_SPARE))
1028		g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED);
1029	else if (resurrection)
1030		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
1031	else if (meta->disk[disk_pos].flags & INTEL_F_FAILED)
1032		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
1033	else if (meta->disk[disk_pos].flags & INTEL_F_SPARE)
1034		g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
1035	else
1036		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
1037	TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
1038		pv = sd->sd_volume->v_md_data;
1039		mvol = intel_get_volume(meta, pv->pv_volume_pos);
1040		mmap0 = intel_get_map(mvol, 0);
1041		if (mvol->migr_state)
1042			mmap1 = intel_get_map(mvol, 1);
1043		else
1044			mmap1 = mmap0;
1045
1046		migr_global = 1;
1047		for (i = 0; i < mmap0->total_disks; i++) {
1048			if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 &&
1049			    (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0)
1050				migr_global = 0;
1051		}
1052
1053		if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
1054		    !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) {
1055			/* Disabled disk, useless. */
1056			g_raid_change_subdisk_state(sd,
1057			    G_RAID_SUBDISK_S_NONE);
1058		} else if (resurrection) {
1059			/* Stale disk, almost same as new. */
1060			g_raid_change_subdisk_state(sd,
1061			    G_RAID_SUBDISK_S_NEW);
1062		} else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) {
1063			/* Failed disk, almost useless. */
1064			g_raid_change_subdisk_state(sd,
1065			    G_RAID_SUBDISK_S_FAILED);
1066		} else if (mvol->migr_state == 0) {
1067			if (mmap0->status == INTEL_S_UNINITIALIZED &&
1068			    (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) {
1069				/* Freshly created uninitialized volume. */
1070				g_raid_change_subdisk_state(sd,
1071				    G_RAID_SUBDISK_S_UNINITIALIZED);
1072			} else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1073				/* Freshly inserted disk. */
1074				g_raid_change_subdisk_state(sd,
1075				    G_RAID_SUBDISK_S_NEW);
1076			} else if (mvol->dirty && (!pv->pv_cng ||
1077			    pv->pv_cng_master_disk != disk_pos)) {
1078				/* Dirty volume (unclean shutdown). */
1079				g_raid_change_subdisk_state(sd,
1080				    G_RAID_SUBDISK_S_STALE);
1081			} else {
1082				/* Up to date disk. */
1083				g_raid_change_subdisk_state(sd,
1084				    G_RAID_SUBDISK_S_ACTIVE);
1085			}
1086		} else if (mvol->migr_type == INTEL_MT_INIT ||
1087			   mvol->migr_type == INTEL_MT_REBUILD) {
1088			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1089				/* Freshly inserted disk. */
1090				g_raid_change_subdisk_state(sd,
1091				    G_RAID_SUBDISK_S_NEW);
1092			} else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1093				/* Rebuilding disk. */
1094				g_raid_change_subdisk_state(sd,
1095				    G_RAID_SUBDISK_S_REBUILD);
1096				if (mvol->dirty) {
1097					sd->sd_rebuild_pos = 0;
1098				} else {
1099					sd->sd_rebuild_pos =
1100					    intel_get_vol_curr_migr_unit(mvol) *
1101					    sd->sd_volume->v_strip_size *
1102					    mmap0->total_domains;
1103				}
1104			} else if (mvol->migr_type == INTEL_MT_INIT &&
1105			    migr_global) {
1106				/* Freshly created uninitialized volume. */
1107				g_raid_change_subdisk_state(sd,
1108				    G_RAID_SUBDISK_S_UNINITIALIZED);
1109			} else if (mvol->dirty && (!pv->pv_cng ||
1110			    pv->pv_cng_master_disk != disk_pos)) {
1111				/* Dirty volume (unclean shutdown). */
1112				g_raid_change_subdisk_state(sd,
1113				    G_RAID_SUBDISK_S_STALE);
1114			} else {
1115				/* Up to date disk. */
1116				g_raid_change_subdisk_state(sd,
1117				    G_RAID_SUBDISK_S_ACTIVE);
1118			}
1119		} else if (mvol->migr_type == INTEL_MT_VERIFY ||
1120			   mvol->migr_type == INTEL_MT_REPAIR) {
1121			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1122				/* Freshly inserted disk. */
1123				g_raid_change_subdisk_state(sd,
1124				    G_RAID_SUBDISK_S_NEW);
1125			} else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) ||
1126			    migr_global) {
1127				/* Resyncing disk. */
1128				g_raid_change_subdisk_state(sd,
1129				    G_RAID_SUBDISK_S_RESYNC);
1130				if (mvol->dirty) {
1131					sd->sd_rebuild_pos = 0;
1132				} else {
1133					sd->sd_rebuild_pos =
1134					    intel_get_vol_curr_migr_unit(mvol) *
1135					    sd->sd_volume->v_strip_size *
1136					    mmap0->total_domains;
1137				}
1138			} else if (mvol->dirty) {
1139				/* Dirty volume (unclean shutdown). */
1140				g_raid_change_subdisk_state(sd,
1141				    G_RAID_SUBDISK_S_STALE);
1142			} else {
1143				/* Up to date disk. */
1144				g_raid_change_subdisk_state(sd,
1145				    G_RAID_SUBDISK_S_ACTIVE);
1146			}
1147		} else if (mvol->migr_type == INTEL_MT_GEN_MIGR) {
1148			if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) {
1149				/* Freshly inserted disk. */
1150				g_raid_change_subdisk_state(sd,
1151				    G_RAID_SUBDISK_S_NEW);
1152			} else {
1153				/* Up to date disk. */
1154				g_raid_change_subdisk_state(sd,
1155				    G_RAID_SUBDISK_S_ACTIVE);
1156			}
1157		}
1158		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1159		    G_RAID_EVENT_SUBDISK);
1160	}
1161
1162	/* Update status of our need for spare. */
1163	if (mdi->mdio_started) {
1164		mdi->mdio_incomplete =
1165		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1166		     g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) <
1167		     meta->total_disks);
1168	}
1169
1170	return (resurrection);
1171}
1172
1173static void
1174g_disk_md_intel_retaste(void *arg, int pending)
1175{
1176
1177	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
1178	g_retaste(&g_raid_class);
1179	free(arg, M_MD_INTEL);
1180}
1181
1182static void
1183g_raid_md_intel_refill(struct g_raid_softc *sc)
1184{
1185	struct g_raid_md_object *md;
1186	struct g_raid_md_intel_object *mdi;
1187	struct intel_raid_conf *meta;
1188	struct g_raid_disk *disk;
1189	struct task *task;
1190	int update, na;
1191
1192	md = sc->sc_md;
1193	mdi = (struct g_raid_md_intel_object *)md;
1194	meta = mdi->mdio_meta;
1195	update = 0;
1196	do {
1197		/* Make sure we miss anything. */
1198		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1199		    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED);
1200		if (na == meta->total_disks)
1201			break;
1202
1203		G_RAID_DEBUG1(1, md->mdo_softc,
1204		    "Array is not complete (%d of %d), "
1205		    "trying to refill.", na, meta->total_disks);
1206
1207		/* Try to get use some of STALE disks. */
1208		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1209			if (disk->d_state == G_RAID_DISK_S_STALE) {
1210				update += g_raid_md_intel_start_disk(disk);
1211				if (disk->d_state == G_RAID_DISK_S_ACTIVE ||
1212				    disk->d_state == G_RAID_DISK_S_DISABLED)
1213					break;
1214			}
1215		}
1216		if (disk != NULL)
1217			continue;
1218
1219		/* Try to get use some of SPARE disks. */
1220		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1221			if (disk->d_state == G_RAID_DISK_S_SPARE) {
1222				update += g_raid_md_intel_start_disk(disk);
1223				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
1224					break;
1225			}
1226		}
1227	} while (disk != NULL);
1228
1229	/* Write new metadata if we changed something. */
1230	if (update) {
1231		g_raid_md_write_intel(md, NULL, NULL, NULL);
1232		meta = mdi->mdio_meta;
1233	}
1234
1235	/* Update status of our need for spare. */
1236	mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1237	    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks);
1238
1239	/* Request retaste hoping to find spare. */
1240	if (mdi->mdio_incomplete) {
1241		task = malloc(sizeof(struct task),
1242		    M_MD_INTEL, M_WAITOK | M_ZERO);
1243		TASK_INIT(task, 0, g_disk_md_intel_retaste, task);
1244		taskqueue_enqueue(taskqueue_swi, task);
1245	}
1246}
1247
1248static void
1249g_raid_md_intel_start(struct g_raid_softc *sc)
1250{
1251	struct g_raid_md_object *md;
1252	struct g_raid_md_intel_object *mdi;
1253	struct g_raid_md_intel_pervolume *pv;
1254	struct g_raid_md_intel_perdisk *pd;
1255	struct intel_raid_conf *meta;
1256	struct intel_raid_vol *mvol;
1257	struct intel_raid_map *mmap;
1258	struct g_raid_volume *vol;
1259	struct g_raid_subdisk *sd;
1260	struct g_raid_disk *disk;
1261	int i, j, disk_pos;
1262
1263	md = sc->sc_md;
1264	mdi = (struct g_raid_md_intel_object *)md;
1265	meta = mdi->mdio_meta;
1266
1267	/* Create volumes and subdisks. */
1268	for (i = 0; i < meta->total_volumes; i++) {
1269		mvol = intel_get_volume(meta, i);
1270		mmap = intel_get_map(mvol, 0);
1271		vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1);
1272		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1273		pv->pv_volume_pos = i;
1274		pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0;
1275		pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0;
1276		if (mvol->cng_master_disk < mmap->total_disks)
1277			pv->pv_cng_master_disk = mvol->cng_master_disk;
1278		vol->v_md_data = pv;
1279		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
1280		if (mmap->type == INTEL_T_RAID0)
1281			vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
1282		else if (mmap->type == INTEL_T_RAID1 &&
1283		    mmap->total_domains >= 2 &&
1284		    mmap->total_domains <= mmap->total_disks) {
1285			/* Assume total_domains is correct. */
1286			if (mmap->total_domains == mmap->total_disks)
1287				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1288			else
1289				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1290		} else if (mmap->type == INTEL_T_RAID1) {
1291			/* total_domains looks wrong. */
1292			if (mmap->total_disks <= 2)
1293				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1294			else
1295				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1296		} else if (mmap->type == INTEL_T_RAID5) {
1297			vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
1298			vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
1299		} else
1300			vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1301		vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ
1302		vol->v_disks_count = mmap->total_disks;
1303		vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ
1304		vol->v_sectorsize = 512; //ZZZ
1305		for (j = 0; j < vol->v_disks_count; j++) {
1306			sd = &vol->v_subdisks[j];
1307			sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ
1308			sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ
1309		}
1310		g_raid_start_volume(vol);
1311	}
1312
1313	/* Create disk placeholders to store data for later writing. */
1314	for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) {
1315		pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1316		pd->pd_disk_pos = disk_pos;
1317		pd->pd_disk_meta = meta->disk[disk_pos];
1318		disk = g_raid_create_disk(sc);
1319		disk->d_md_data = (void *)pd;
1320		disk->d_state = G_RAID_DISK_S_OFFLINE;
1321		for (i = 0; i < meta->total_volumes; i++) {
1322			mvol = intel_get_volume(meta, i);
1323			mmap = intel_get_map(mvol, 0);
1324			for (j = 0; j < mmap->total_disks; j++) {
1325				if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos)
1326					break;
1327			}
1328			if (j == mmap->total_disks)
1329				continue;
1330			vol = g_raid_md_intel_get_volume(sc, i);
1331			sd = &vol->v_subdisks[j];
1332			sd->sd_disk = disk;
1333			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1334		}
1335	}
1336
1337	/* Make all disks found till the moment take their places. */
1338	do {
1339		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1340			if (disk->d_state == G_RAID_DISK_S_NONE) {
1341				g_raid_md_intel_start_disk(disk);
1342				break;
1343			}
1344		}
1345	} while (disk != NULL);
1346
1347	mdi->mdio_started = 1;
1348	G_RAID_DEBUG1(0, sc, "Array started.");
1349	g_raid_md_write_intel(md, NULL, NULL, NULL);
1350
1351	/* Pickup any STALE/SPARE disks to refill array if needed. */
1352	g_raid_md_intel_refill(sc);
1353
1354	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1355		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1356		    G_RAID_EVENT_VOLUME);
1357	}
1358
1359	callout_stop(&mdi->mdio_start_co);
1360	G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount);
1361	root_mount_rel(mdi->mdio_rootmount);
1362	mdi->mdio_rootmount = NULL;
1363}
1364
1365static void
1366g_raid_md_intel_new_disk(struct g_raid_disk *disk)
1367{
1368	struct g_raid_softc *sc;
1369	struct g_raid_md_object *md;
1370	struct g_raid_md_intel_object *mdi;
1371	struct intel_raid_conf *pdmeta;
1372	struct g_raid_md_intel_perdisk *pd;
1373
1374	sc = disk->d_softc;
1375	md = sc->sc_md;
1376	mdi = (struct g_raid_md_intel_object *)md;
1377	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1378	pdmeta = pd->pd_meta;
1379
1380	if (mdi->mdio_started) {
1381		if (g_raid_md_intel_start_disk(disk))
1382			g_raid_md_write_intel(md, NULL, NULL, NULL);
1383	} else {
1384		/* If we haven't started yet - check metadata freshness. */
1385		if (mdi->mdio_meta == NULL ||
1386		    ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) {
1387			G_RAID_DEBUG1(1, sc, "Newer disk");
1388			if (mdi->mdio_meta != NULL)
1389				free(mdi->mdio_meta, M_MD_INTEL);
1390			mdi->mdio_meta = intel_meta_copy(pdmeta);
1391			mdi->mdio_generation = mdi->mdio_meta->generation;
1392			mdi->mdio_disks_present = 1;
1393		} else if (pdmeta->generation == mdi->mdio_generation) {
1394			mdi->mdio_disks_present++;
1395			G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
1396			    mdi->mdio_disks_present,
1397			    mdi->mdio_meta->total_disks);
1398		} else {
1399			G_RAID_DEBUG1(1, sc, "Older disk");
1400		}
1401		/* If we collected all needed disks - start array. */
1402		if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
1403			g_raid_md_intel_start(sc);
1404	}
1405}
1406
1407static void
1408g_raid_intel_go(void *arg)
1409{
1410	struct g_raid_softc *sc;
1411	struct g_raid_md_object *md;
1412	struct g_raid_md_intel_object *mdi;
1413
1414	sc = arg;
1415	md = sc->sc_md;
1416	mdi = (struct g_raid_md_intel_object *)md;
1417	if (!mdi->mdio_started) {
1418		G_RAID_DEBUG1(0, sc, "Force array start due to timeout.");
1419		g_raid_event_send(sc, G_RAID_NODE_E_START, 0);
1420	}
1421}
1422
1423static int
1424g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp,
1425    struct g_geom **gp)
1426{
1427	struct g_raid_softc *sc;
1428	struct g_raid_md_intel_object *mdi;
1429	char name[16];
1430
1431	mdi = (struct g_raid_md_intel_object *)md;
1432	mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random();
1433	mdi->mdio_generation = 0;
1434	snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
1435	sc = g_raid_create_node(mp, name, md);
1436	if (sc == NULL)
1437		return (G_RAID_MD_TASTE_FAIL);
1438	md->mdo_softc = sc;
1439	*gp = sc->sc_geom;
1440	return (G_RAID_MD_TASTE_NEW);
1441}
1442
1443/*
1444 * Return the last N characters of the serial label.  The Linux and
1445 * ataraid(7) code always uses the last 16 characters of the label to
1446 * store into the Intel meta format.  Generalize this to N characters
1447 * since that's easy.  Labels can be up to 20 characters for SATA drives
1448 * and up 251 characters for SAS drives.  Since intel controllers don't
1449 * support SAS drives, just stick with the SATA limits for stack friendliness.
1450 */
1451static int
1452g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen)
1453{
1454	char serial_buffer[DISK_IDENT_SIZE];
1455	int len, error;
1456
1457	len = sizeof(serial_buffer);
1458	error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer);
1459	if (error != 0)
1460		return (error);
1461	len = strlen(serial_buffer);
1462	if (len > serlen)
1463		len -= serlen;
1464	else
1465		len = 0;
1466	strncpy(serial, serial_buffer + len, serlen);
1467	return (0);
1468}
1469
1470static int
1471g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp,
1472                              struct g_consumer *cp, struct g_geom **gp)
1473{
1474	struct g_consumer *rcp;
1475	struct g_provider *pp;
1476	struct g_raid_md_intel_object *mdi, *mdi1;
1477	struct g_raid_softc *sc;
1478	struct g_raid_disk *disk;
1479	struct intel_raid_conf *meta;
1480	struct g_raid_md_intel_perdisk *pd;
1481	struct g_geom *geom;
1482	int error, disk_pos, result, spare, len;
1483	char serial[INTEL_SERIAL_LEN];
1484	char name[16];
1485	uint16_t vendor;
1486
1487	G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name);
1488	mdi = (struct g_raid_md_intel_object *)md;
1489	pp = cp->provider;
1490
1491	/* Read metadata from device. */
1492	meta = NULL;
1493	disk_pos = 0;
1494	g_topology_unlock();
1495	error = g_raid_md_get_label(cp, serial, sizeof(serial));
1496	if (error != 0) {
1497		G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).",
1498		    pp->name, error);
1499		goto fail2;
1500	}
1501	vendor = 0xffff;
1502	len = sizeof(vendor);
1503	if (pp->geom->rank == 1)
1504		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1505	meta = intel_meta_read(cp);
1506	g_topology_lock();
1507	if (meta == NULL) {
1508		if (g_raid_aggressive_spare) {
1509			if (vendor != 0x8086) {
1510				G_RAID_DEBUG(1,
1511				    "Intel vendor mismatch 0x%04x != 0x8086",
1512				    vendor);
1513			} else {
1514				G_RAID_DEBUG(1,
1515				    "No Intel metadata, forcing spare.");
1516				spare = 2;
1517				goto search;
1518			}
1519		}
1520		return (G_RAID_MD_TASTE_FAIL);
1521	}
1522
1523	/* Check this disk position in obtained metadata. */
1524	disk_pos = intel_meta_find_disk(meta, serial);
1525	if (disk_pos < 0) {
1526		G_RAID_DEBUG(1, "Intel serial '%s' not found", serial);
1527		goto fail1;
1528	}
1529	if (intel_get_disk_sectors(&meta->disk[disk_pos]) !=
1530	    (pp->mediasize / pp->sectorsize)) {
1531		G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju",
1532		    intel_get_disk_sectors(&meta->disk[disk_pos]),
1533		    (off_t)(pp->mediasize / pp->sectorsize));
1534		goto fail1;
1535	}
1536
1537	G_RAID_DEBUG(1, "Intel disk position %d", disk_pos);
1538	spare = meta->disk[disk_pos].flags & INTEL_F_SPARE;
1539
1540search:
1541	/* Search for matching node. */
1542	sc = NULL;
1543	mdi1 = NULL;
1544	LIST_FOREACH(geom, &mp->geom, geom) {
1545		sc = geom->softc;
1546		if (sc == NULL)
1547			continue;
1548		if (sc->sc_stopping != 0)
1549			continue;
1550		if (sc->sc_md->mdo_class != md->mdo_class)
1551			continue;
1552		mdi1 = (struct g_raid_md_intel_object *)sc->sc_md;
1553		if (spare) {
1554			if (mdi1->mdio_incomplete)
1555				break;
1556		} else {
1557			if (mdi1->mdio_config_id == meta->config_id)
1558				break;
1559		}
1560	}
1561
1562	/* Found matching node. */
1563	if (geom != NULL) {
1564		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1565		result = G_RAID_MD_TASTE_EXISTING;
1566
1567	} else if (spare) { /* Not found needy node -- left for later. */
1568		G_RAID_DEBUG(1, "Spare is not needed at this time");
1569		goto fail1;
1570
1571	} else { /* Not found matching node -- create one. */
1572		result = G_RAID_MD_TASTE_NEW;
1573		mdi->mdio_config_id = meta->config_id;
1574		mdi->mdio_orig_config_id = meta->orig_config_id;
1575		snprintf(name, sizeof(name), "Intel-%08x", meta->config_id);
1576		sc = g_raid_create_node(mp, name, md);
1577		md->mdo_softc = sc;
1578		geom = sc->sc_geom;
1579		callout_init(&mdi->mdio_start_co, 1);
1580		callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz,
1581		    g_raid_intel_go, sc);
1582		mdi->mdio_rootmount = root_mount_hold("GRAID-Intel");
1583		G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount);
1584	}
1585
1586	/* There is no return after this point, so we close passed consumer. */
1587	g_access(cp, -1, 0, 0);
1588
1589	rcp = g_new_consumer(geom);
1590	rcp->flags |= G_CF_DIRECT_RECEIVE;
1591	g_attach(rcp, pp);
1592	if (g_access(rcp, 1, 1, 1) != 0)
1593		; //goto fail1;
1594
1595	g_topology_unlock();
1596	sx_xlock(&sc->sc_lock);
1597
1598	pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1599	pd->pd_meta = meta;
1600	pd->pd_disk_pos = -1;
1601	if (spare == 2) {
1602		memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN);
1603		intel_set_disk_sectors(&pd->pd_disk_meta,
1604		    pp->mediasize / pp->sectorsize);
1605		pd->pd_disk_meta.id = 0;
1606		pd->pd_disk_meta.flags = INTEL_F_SPARE;
1607	} else {
1608		pd->pd_disk_meta = meta->disk[disk_pos];
1609	}
1610	disk = g_raid_create_disk(sc);
1611	disk->d_md_data = (void *)pd;
1612	disk->d_consumer = rcp;
1613	rcp->private = disk;
1614
1615	g_raid_get_disk_info(disk);
1616
1617	g_raid_md_intel_new_disk(disk);
1618
1619	sx_xunlock(&sc->sc_lock);
1620	g_topology_lock();
1621	*gp = geom;
1622	return (result);
1623fail2:
1624	g_topology_lock();
1625fail1:
1626	free(meta, M_MD_INTEL);
1627	return (G_RAID_MD_TASTE_FAIL);
1628}
1629
1630static int
1631g_raid_md_event_intel(struct g_raid_md_object *md,
1632    struct g_raid_disk *disk, u_int event)
1633{
1634	struct g_raid_softc *sc;
1635	struct g_raid_subdisk *sd;
1636	struct g_raid_md_intel_object *mdi;
1637	struct g_raid_md_intel_perdisk *pd;
1638
1639	sc = md->mdo_softc;
1640	mdi = (struct g_raid_md_intel_object *)md;
1641	if (disk == NULL) {
1642		switch (event) {
1643		case G_RAID_NODE_E_START:
1644			if (!mdi->mdio_started)
1645				g_raid_md_intel_start(sc);
1646			return (0);
1647		}
1648		return (-1);
1649	}
1650	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1651	switch (event) {
1652	case G_RAID_DISK_E_DISCONNECTED:
1653		/* If disk was assigned, just update statuses. */
1654		if (pd->pd_disk_pos >= 0) {
1655			g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1656			if (disk->d_consumer) {
1657				g_raid_kill_consumer(sc, disk->d_consumer);
1658				disk->d_consumer = NULL;
1659			}
1660			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
1661				g_raid_change_subdisk_state(sd,
1662				    G_RAID_SUBDISK_S_NONE);
1663				g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
1664				    G_RAID_EVENT_SUBDISK);
1665			}
1666		} else {
1667			/* Otherwise -- delete. */
1668			g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1669			g_raid_destroy_disk(disk);
1670		}
1671
1672		/* Write updated metadata to all disks. */
1673		g_raid_md_write_intel(md, NULL, NULL, NULL);
1674
1675		/* Check if anything left except placeholders. */
1676		if (g_raid_ndisks(sc, -1) ==
1677		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
1678			g_raid_destroy_node(sc, 0);
1679		else
1680			g_raid_md_intel_refill(sc);
1681		return (0);
1682	}
1683	return (-2);
1684}
1685
1686static int
1687g_raid_md_ctl_intel(struct g_raid_md_object *md,
1688    struct gctl_req *req)
1689{
1690	struct g_raid_softc *sc;
1691	struct g_raid_volume *vol, *vol1;
1692	struct g_raid_subdisk *sd;
1693	struct g_raid_disk *disk;
1694	struct g_raid_md_intel_object *mdi;
1695	struct g_raid_md_intel_pervolume *pv;
1696	struct g_raid_md_intel_perdisk *pd;
1697	struct g_consumer *cp;
1698	struct g_provider *pp;
1699	char arg[16], serial[INTEL_SERIAL_LEN];
1700	const char *nodename, *verb, *volname, *levelname, *diskname;
1701	char *tmp;
1702	int *nargs, *force;
1703	off_t off, size, sectorsize, strip, disk_sectors;
1704	intmax_t *sizearg, *striparg;
1705	int numdisks, i, len, level, qual, update;
1706	int error;
1707
1708	sc = md->mdo_softc;
1709	mdi = (struct g_raid_md_intel_object *)md;
1710	verb = gctl_get_param(req, "verb", NULL);
1711	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1712	error = 0;
1713	if (strcmp(verb, "label") == 0) {
1714		if (*nargs < 4) {
1715			gctl_error(req, "Invalid number of arguments.");
1716			return (-1);
1717		}
1718		volname = gctl_get_asciiparam(req, "arg1");
1719		if (volname == NULL) {
1720			gctl_error(req, "No volume name.");
1721			return (-2);
1722		}
1723		levelname = gctl_get_asciiparam(req, "arg2");
1724		if (levelname == NULL) {
1725			gctl_error(req, "No RAID level.");
1726			return (-3);
1727		}
1728		if (strcasecmp(levelname, "RAID5") == 0)
1729			levelname = "RAID5-LA";
1730		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1731			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1732			return (-4);
1733		}
1734		numdisks = *nargs - 3;
1735		force = gctl_get_paraml(req, "force", sizeof(*force));
1736		if (!g_raid_md_intel_supported(level, qual, numdisks,
1737		    force ? *force : 0)) {
1738			gctl_error(req, "Unsupported RAID level "
1739			    "(0x%02x/0x%02x), or number of disks (%d).",
1740			    level, qual, numdisks);
1741			return (-5);
1742		}
1743
1744		/* Search for disks, connect them and probe. */
1745		size = 0x7fffffffffffffffllu;
1746		sectorsize = 0;
1747		for (i = 0; i < numdisks; i++) {
1748			snprintf(arg, sizeof(arg), "arg%d", i + 3);
1749			diskname = gctl_get_asciiparam(req, arg);
1750			if (diskname == NULL) {
1751				gctl_error(req, "No disk name (%s).", arg);
1752				error = -6;
1753				break;
1754			}
1755			if (strcmp(diskname, "NONE") == 0) {
1756				cp = NULL;
1757				pp = NULL;
1758			} else {
1759				g_topology_lock();
1760				cp = g_raid_open_consumer(sc, diskname);
1761				if (cp == NULL) {
1762					gctl_error(req, "Can't open disk '%s'.",
1763					    diskname);
1764					g_topology_unlock();
1765					error = -7;
1766					break;
1767				}
1768				pp = cp->provider;
1769			}
1770			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1771			pd->pd_disk_pos = i;
1772			disk = g_raid_create_disk(sc);
1773			disk->d_md_data = (void *)pd;
1774			disk->d_consumer = cp;
1775			if (cp == NULL) {
1776				strcpy(&pd->pd_disk_meta.serial[0], "NONE");
1777				pd->pd_disk_meta.id = 0xffffffff;
1778				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
1779				continue;
1780			}
1781			cp->private = disk;
1782			g_topology_unlock();
1783
1784			error = g_raid_md_get_label(cp,
1785			    &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN);
1786			if (error != 0) {
1787				gctl_error(req,
1788				    "Can't get serial for provider '%s'.",
1789				    diskname);
1790				error = -8;
1791				break;
1792			}
1793
1794			g_raid_get_disk_info(disk);
1795
1796			intel_set_disk_sectors(&pd->pd_disk_meta,
1797			    pp->mediasize / pp->sectorsize);
1798			if (size > pp->mediasize)
1799				size = pp->mediasize;
1800			if (sectorsize < pp->sectorsize)
1801				sectorsize = pp->sectorsize;
1802			pd->pd_disk_meta.id = 0;
1803			pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE;
1804		}
1805		if (error != 0)
1806			return (error);
1807
1808		if (sectorsize <= 0) {
1809			gctl_error(req, "Can't get sector size.");
1810			return (-8);
1811		}
1812
1813		/* Reserve some space for metadata. */
1814		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1815
1816		/* Handle size argument. */
1817		len = sizeof(*sizearg);
1818		sizearg = gctl_get_param(req, "size", &len);
1819		if (sizearg != NULL && len == sizeof(*sizearg) &&
1820		    *sizearg > 0) {
1821			if (*sizearg > size) {
1822				gctl_error(req, "Size too big %lld > %lld.",
1823				    (long long)*sizearg, (long long)size);
1824				return (-9);
1825			}
1826			size = *sizearg;
1827		}
1828
1829		/* Handle strip argument. */
1830		strip = 131072;
1831		len = sizeof(*striparg);
1832		striparg = gctl_get_param(req, "strip", &len);
1833		if (striparg != NULL && len == sizeof(*striparg) &&
1834		    *striparg > 0) {
1835			if (*striparg < sectorsize) {
1836				gctl_error(req, "Strip size too small.");
1837				return (-10);
1838			}
1839			if (*striparg % sectorsize != 0) {
1840				gctl_error(req, "Incorrect strip size.");
1841				return (-11);
1842			}
1843			if (strip > 65535 * sectorsize) {
1844				gctl_error(req, "Strip size too big.");
1845				return (-12);
1846			}
1847			strip = *striparg;
1848		}
1849
1850		/* Round size down to strip or sector. */
1851		if (level == G_RAID_VOLUME_RL_RAID1)
1852			size -= (size % sectorsize);
1853		else if (level == G_RAID_VOLUME_RL_RAID1E &&
1854		    (numdisks & 1) != 0)
1855			size -= (size % (2 * strip));
1856		else
1857			size -= (size % strip);
1858		if (size <= 0) {
1859			gctl_error(req, "Size too small.");
1860			return (-13);
1861		}
1862
1863		/* We have all we need, create things: volume, ... */
1864		mdi->mdio_started = 1;
1865		vol = g_raid_create_volume(sc, volname, -1);
1866		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1867		pv->pv_volume_pos = 0;
1868		vol->v_md_data = pv;
1869		vol->v_raid_level = level;
1870		vol->v_raid_level_qualifier = qual;
1871		vol->v_strip_size = strip;
1872		vol->v_disks_count = numdisks;
1873		if (level == G_RAID_VOLUME_RL_RAID0)
1874			vol->v_mediasize = size * numdisks;
1875		else if (level == G_RAID_VOLUME_RL_RAID1)
1876			vol->v_mediasize = size;
1877		else if (level == G_RAID_VOLUME_RL_RAID5)
1878			vol->v_mediasize = size * (numdisks - 1);
1879		else { /* RAID1E */
1880			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1881			    strip;
1882		}
1883		vol->v_sectorsize = sectorsize;
1884		g_raid_start_volume(vol);
1885
1886		/* , and subdisks. */
1887		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1888			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1889			sd = &vol->v_subdisks[pd->pd_disk_pos];
1890			sd->sd_disk = disk;
1891			sd->sd_offset = 0;
1892			sd->sd_size = size;
1893			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1894			if (sd->sd_disk->d_consumer != NULL) {
1895				g_raid_change_disk_state(disk,
1896				    G_RAID_DISK_S_ACTIVE);
1897				if (level == G_RAID_VOLUME_RL_RAID5)
1898					g_raid_change_subdisk_state(sd,
1899					    G_RAID_SUBDISK_S_UNINITIALIZED);
1900				else
1901					g_raid_change_subdisk_state(sd,
1902					    G_RAID_SUBDISK_S_ACTIVE);
1903				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1904				    G_RAID_EVENT_SUBDISK);
1905			} else {
1906				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1907			}
1908		}
1909
1910		/* Write metadata based on created entities. */
1911		G_RAID_DEBUG1(0, sc, "Array started.");
1912		g_raid_md_write_intel(md, NULL, NULL, NULL);
1913
1914		/* Pickup any STALE/SPARE disks to refill array if needed. */
1915		g_raid_md_intel_refill(sc);
1916
1917		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1918		    G_RAID_EVENT_VOLUME);
1919		return (0);
1920	}
1921	if (strcmp(verb, "add") == 0) {
1922		if (*nargs != 3) {
1923			gctl_error(req, "Invalid number of arguments.");
1924			return (-1);
1925		}
1926		volname = gctl_get_asciiparam(req, "arg1");
1927		if (volname == NULL) {
1928			gctl_error(req, "No volume name.");
1929			return (-2);
1930		}
1931		levelname = gctl_get_asciiparam(req, "arg2");
1932		if (levelname == NULL) {
1933			gctl_error(req, "No RAID level.");
1934			return (-3);
1935		}
1936		if (strcasecmp(levelname, "RAID5") == 0)
1937			levelname = "RAID5-LA";
1938		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1939			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1940			return (-4);
1941		}
1942
1943		/* Look for existing volumes. */
1944		i = 0;
1945		vol1 = NULL;
1946		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1947			vol1 = vol;
1948			i++;
1949		}
1950		if (i > 1) {
1951			gctl_error(req, "Maximum two volumes supported.");
1952			return (-6);
1953		}
1954		if (vol1 == NULL) {
1955			gctl_error(req, "At least one volume must exist.");
1956			return (-7);
1957		}
1958
1959		numdisks = vol1->v_disks_count;
1960		force = gctl_get_paraml(req, "force", sizeof(*force));
1961		if (!g_raid_md_intel_supported(level, qual, numdisks,
1962		    force ? *force : 0)) {
1963			gctl_error(req, "Unsupported RAID level "
1964			    "(0x%02x/0x%02x), or number of disks (%d).",
1965			    level, qual, numdisks);
1966			return (-5);
1967		}
1968
1969		/* Collect info about present disks. */
1970		size = 0x7fffffffffffffffllu;
1971		sectorsize = 512;
1972		for (i = 0; i < numdisks; i++) {
1973			disk = vol1->v_subdisks[i].sd_disk;
1974			pd = (struct g_raid_md_intel_perdisk *)
1975			    disk->d_md_data;
1976			disk_sectors =
1977			    intel_get_disk_sectors(&pd->pd_disk_meta);
1978
1979			if (disk_sectors * 512 < size)
1980				size = disk_sectors * 512;
1981			if (disk->d_consumer != NULL &&
1982			    disk->d_consumer->provider != NULL &&
1983			    disk->d_consumer->provider->sectorsize >
1984			     sectorsize) {
1985				sectorsize =
1986				    disk->d_consumer->provider->sectorsize;
1987			}
1988		}
1989
1990		/* Reserve some space for metadata. */
1991		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1992
1993		/* Decide insert before or after. */
1994		sd = &vol1->v_subdisks[0];
1995		if (sd->sd_offset >
1996		    size - (sd->sd_offset + sd->sd_size)) {
1997			off = 0;
1998			size = sd->sd_offset;
1999		} else {
2000			off = sd->sd_offset + sd->sd_size;
2001			size = size - (sd->sd_offset + sd->sd_size);
2002		}
2003
2004		/* Handle strip argument. */
2005		strip = 131072;
2006		len = sizeof(*striparg);
2007		striparg = gctl_get_param(req, "strip", &len);
2008		if (striparg != NULL && len == sizeof(*striparg) &&
2009		    *striparg > 0) {
2010			if (*striparg < sectorsize) {
2011				gctl_error(req, "Strip size too small.");
2012				return (-10);
2013			}
2014			if (*striparg % sectorsize != 0) {
2015				gctl_error(req, "Incorrect strip size.");
2016				return (-11);
2017			}
2018			if (strip > 65535 * sectorsize) {
2019				gctl_error(req, "Strip size too big.");
2020				return (-12);
2021			}
2022			strip = *striparg;
2023		}
2024
2025		/* Round offset up to strip. */
2026		if (off % strip != 0) {
2027			size -= strip - off % strip;
2028			off += strip - off % strip;
2029		}
2030
2031		/* Handle size argument. */
2032		len = sizeof(*sizearg);
2033		sizearg = gctl_get_param(req, "size", &len);
2034		if (sizearg != NULL && len == sizeof(*sizearg) &&
2035		    *sizearg > 0) {
2036			if (*sizearg > size) {
2037				gctl_error(req, "Size too big %lld > %lld.",
2038				    (long long)*sizearg, (long long)size);
2039				return (-9);
2040			}
2041			size = *sizearg;
2042		}
2043
2044		/* Round size down to strip or sector. */
2045		if (level == G_RAID_VOLUME_RL_RAID1)
2046			size -= (size % sectorsize);
2047		else
2048			size -= (size % strip);
2049		if (size <= 0) {
2050			gctl_error(req, "Size too small.");
2051			return (-13);
2052		}
2053		if (size > 0xffffffffllu * sectorsize) {
2054			gctl_error(req, "Size too big.");
2055			return (-14);
2056		}
2057
2058		/* We have all we need, create things: volume, ... */
2059		vol = g_raid_create_volume(sc, volname, -1);
2060		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
2061		pv->pv_volume_pos = i;
2062		vol->v_md_data = pv;
2063		vol->v_raid_level = level;
2064		vol->v_raid_level_qualifier = qual;
2065		vol->v_strip_size = strip;
2066		vol->v_disks_count = numdisks;
2067		if (level == G_RAID_VOLUME_RL_RAID0)
2068			vol->v_mediasize = size * numdisks;
2069		else if (level == G_RAID_VOLUME_RL_RAID1)
2070			vol->v_mediasize = size;
2071		else if (level == G_RAID_VOLUME_RL_RAID5)
2072			vol->v_mediasize = size * (numdisks - 1);
2073		else { /* RAID1E */
2074			vol->v_mediasize = ((size * numdisks) / strip / 2) *
2075			    strip;
2076		}
2077		vol->v_sectorsize = sectorsize;
2078		g_raid_start_volume(vol);
2079
2080		/* , and subdisks. */
2081		for (i = 0; i < numdisks; i++) {
2082			disk = vol1->v_subdisks[i].sd_disk;
2083			sd = &vol->v_subdisks[i];
2084			sd->sd_disk = disk;
2085			sd->sd_offset = off;
2086			sd->sd_size = size;
2087			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
2088			if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
2089				if (level == G_RAID_VOLUME_RL_RAID5)
2090					g_raid_change_subdisk_state(sd,
2091					    G_RAID_SUBDISK_S_UNINITIALIZED);
2092				else
2093					g_raid_change_subdisk_state(sd,
2094					    G_RAID_SUBDISK_S_ACTIVE);
2095				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
2096				    G_RAID_EVENT_SUBDISK);
2097			}
2098		}
2099
2100		/* Write metadata based on created entities. */
2101		g_raid_md_write_intel(md, NULL, NULL, NULL);
2102
2103		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
2104		    G_RAID_EVENT_VOLUME);
2105		return (0);
2106	}
2107	if (strcmp(verb, "delete") == 0) {
2108		nodename = gctl_get_asciiparam(req, "arg0");
2109		if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
2110			nodename = NULL;
2111
2112		/* Full node destruction. */
2113		if (*nargs == 1 && nodename != NULL) {
2114			/* Check if some volume is still open. */
2115			force = gctl_get_paraml(req, "force", sizeof(*force));
2116			if (force != NULL && *force == 0 &&
2117			    g_raid_nopens(sc) != 0) {
2118				gctl_error(req, "Some volume is still open.");
2119				return (-4);
2120			}
2121
2122			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2123				if (disk->d_consumer)
2124					intel_meta_erase(disk->d_consumer);
2125			}
2126			g_raid_destroy_node(sc, 0);
2127			return (0);
2128		}
2129
2130		/* Destroy specified volume. If it was last - all node. */
2131		if (*nargs > 2) {
2132			gctl_error(req, "Invalid number of arguments.");
2133			return (-1);
2134		}
2135		volname = gctl_get_asciiparam(req,
2136		    nodename != NULL ? "arg1" : "arg0");
2137		if (volname == NULL) {
2138			gctl_error(req, "No volume name.");
2139			return (-2);
2140		}
2141
2142		/* Search for volume. */
2143		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2144			if (strcmp(vol->v_name, volname) == 0)
2145				break;
2146			pp = vol->v_provider;
2147			if (pp == NULL)
2148				continue;
2149			if (strcmp(pp->name, volname) == 0)
2150				break;
2151			if (strncmp(pp->name, "raid/", 5) == 0 &&
2152			    strcmp(pp->name + 5, volname) == 0)
2153				break;
2154		}
2155		if (vol == NULL) {
2156			i = strtol(volname, &tmp, 10);
2157			if (verb != volname && tmp[0] == 0) {
2158				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2159					if (vol->v_global_id == i)
2160						break;
2161				}
2162			}
2163		}
2164		if (vol == NULL) {
2165			gctl_error(req, "Volume '%s' not found.", volname);
2166			return (-3);
2167		}
2168
2169		/* Check if volume is still open. */
2170		force = gctl_get_paraml(req, "force", sizeof(*force));
2171		if (force != NULL && *force == 0 &&
2172		    vol->v_provider_open != 0) {
2173			gctl_error(req, "Volume is still open.");
2174			return (-4);
2175		}
2176
2177		/* Destroy volume and potentially node. */
2178		i = 0;
2179		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
2180			i++;
2181		if (i >= 2) {
2182			g_raid_destroy_volume(vol);
2183			g_raid_md_write_intel(md, NULL, NULL, NULL);
2184		} else {
2185			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2186				if (disk->d_consumer)
2187					intel_meta_erase(disk->d_consumer);
2188			}
2189			g_raid_destroy_node(sc, 0);
2190		}
2191		return (0);
2192	}
2193	if (strcmp(verb, "remove") == 0 ||
2194	    strcmp(verb, "fail") == 0) {
2195		if (*nargs < 2) {
2196			gctl_error(req, "Invalid number of arguments.");
2197			return (-1);
2198		}
2199		for (i = 1; i < *nargs; i++) {
2200			snprintf(arg, sizeof(arg), "arg%d", i);
2201			diskname = gctl_get_asciiparam(req, arg);
2202			if (diskname == NULL) {
2203				gctl_error(req, "No disk name (%s).", arg);
2204				error = -2;
2205				break;
2206			}
2207			if (strncmp(diskname, _PATH_DEV, 5) == 0)
2208				diskname += 5;
2209
2210			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2211				if (disk->d_consumer != NULL &&
2212				    disk->d_consumer->provider != NULL &&
2213				    strcmp(disk->d_consumer->provider->name,
2214				     diskname) == 0)
2215					break;
2216			}
2217			if (disk == NULL) {
2218				gctl_error(req, "Disk '%s' not found.",
2219				    diskname);
2220				error = -3;
2221				break;
2222			}
2223
2224			if (strcmp(verb, "fail") == 0) {
2225				g_raid_md_fail_disk_intel(md, NULL, disk);
2226				continue;
2227			}
2228
2229			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2230
2231			/* Erase metadata on deleting disk. */
2232			intel_meta_erase(disk->d_consumer);
2233
2234			/* If disk was assigned, just update statuses. */
2235			if (pd->pd_disk_pos >= 0) {
2236				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
2237				g_raid_kill_consumer(sc, disk->d_consumer);
2238				disk->d_consumer = NULL;
2239				TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2240					g_raid_change_subdisk_state(sd,
2241					    G_RAID_SUBDISK_S_NONE);
2242					g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2243					    G_RAID_EVENT_SUBDISK);
2244				}
2245			} else {
2246				/* Otherwise -- delete. */
2247				g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
2248				g_raid_destroy_disk(disk);
2249			}
2250		}
2251
2252		/* Write updated metadata to remaining disks. */
2253		g_raid_md_write_intel(md, NULL, NULL, NULL);
2254
2255		/* Check if anything left except placeholders. */
2256		if (g_raid_ndisks(sc, -1) ==
2257		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2258			g_raid_destroy_node(sc, 0);
2259		else
2260			g_raid_md_intel_refill(sc);
2261		return (error);
2262	}
2263	if (strcmp(verb, "insert") == 0) {
2264		if (*nargs < 2) {
2265			gctl_error(req, "Invalid number of arguments.");
2266			return (-1);
2267		}
2268		update = 0;
2269		for (i = 1; i < *nargs; i++) {
2270			/* Get disk name. */
2271			snprintf(arg, sizeof(arg), "arg%d", i);
2272			diskname = gctl_get_asciiparam(req, arg);
2273			if (diskname == NULL) {
2274				gctl_error(req, "No disk name (%s).", arg);
2275				error = -3;
2276				break;
2277			}
2278
2279			/* Try to find provider with specified name. */
2280			g_topology_lock();
2281			cp = g_raid_open_consumer(sc, diskname);
2282			if (cp == NULL) {
2283				gctl_error(req, "Can't open disk '%s'.",
2284				    diskname);
2285				g_topology_unlock();
2286				error = -4;
2287				break;
2288			}
2289			pp = cp->provider;
2290			g_topology_unlock();
2291
2292			/* Read disk serial. */
2293			error = g_raid_md_get_label(cp,
2294			    &serial[0], INTEL_SERIAL_LEN);
2295			if (error != 0) {
2296				gctl_error(req,
2297				    "Can't get serial for provider '%s'.",
2298				    diskname);
2299				g_raid_kill_consumer(sc, cp);
2300				error = -7;
2301				break;
2302			}
2303
2304			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
2305			pd->pd_disk_pos = -1;
2306
2307			disk = g_raid_create_disk(sc);
2308			disk->d_consumer = cp;
2309			disk->d_md_data = (void *)pd;
2310			cp->private = disk;
2311
2312			g_raid_get_disk_info(disk);
2313
2314			memcpy(&pd->pd_disk_meta.serial[0], &serial[0],
2315			    INTEL_SERIAL_LEN);
2316			intel_set_disk_sectors(&pd->pd_disk_meta,
2317			    pp->mediasize / pp->sectorsize);
2318			pd->pd_disk_meta.id = 0;
2319			pd->pd_disk_meta.flags = INTEL_F_SPARE;
2320
2321			/* Welcome the "new" disk. */
2322			update += g_raid_md_intel_start_disk(disk);
2323			if (disk->d_state == G_RAID_DISK_S_SPARE) {
2324				intel_meta_write_spare(cp, &pd->pd_disk_meta);
2325				g_raid_destroy_disk(disk);
2326			} else if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2327				gctl_error(req, "Disk '%s' doesn't fit.",
2328				    diskname);
2329				g_raid_destroy_disk(disk);
2330				error = -8;
2331				break;
2332			}
2333		}
2334
2335		/* Write new metadata if we changed something. */
2336		if (update)
2337			g_raid_md_write_intel(md, NULL, NULL, NULL);
2338		return (error);
2339	}
2340	return (-100);
2341}
2342
2343static int
2344g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol,
2345    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2346{
2347	struct g_raid_softc *sc;
2348	struct g_raid_volume *vol;
2349	struct g_raid_subdisk *sd;
2350	struct g_raid_disk *disk;
2351	struct g_raid_md_intel_object *mdi;
2352	struct g_raid_md_intel_pervolume *pv;
2353	struct g_raid_md_intel_perdisk *pd;
2354	struct intel_raid_conf *meta;
2355	struct intel_raid_vol *mvol;
2356	struct intel_raid_map *mmap0, *mmap1;
2357	off_t sectorsize = 512, pos;
2358	const char *version, *cv;
2359	int vi, sdi, numdisks, len, state, stale;
2360
2361	sc = md->mdo_softc;
2362	mdi = (struct g_raid_md_intel_object *)md;
2363
2364	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2365		return (0);
2366
2367	/* Bump generation. Newly written metadata may differ from previous. */
2368	mdi->mdio_generation++;
2369
2370	/* Count number of disks. */
2371	numdisks = 0;
2372	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2373		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2374		if (pd->pd_disk_pos < 0)
2375			continue;
2376		numdisks++;
2377		if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
2378			pd->pd_disk_meta.flags =
2379			    INTEL_F_ONLINE | INTEL_F_ASSIGNED;
2380		} else if (disk->d_state == G_RAID_DISK_S_FAILED) {
2381			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2382			    INTEL_F_ASSIGNED;
2383		} else if (disk->d_state == G_RAID_DISK_S_DISABLED) {
2384			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2385			    INTEL_F_ASSIGNED | INTEL_F_DISABLED;
2386		} else {
2387			if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED))
2388				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
2389			if (pd->pd_disk_meta.id != 0xffffffff) {
2390				pd->pd_disk_meta.id = 0xffffffff;
2391				len = strlen(pd->pd_disk_meta.serial);
2392				len = min(len, INTEL_SERIAL_LEN - 3);
2393				strcpy(pd->pd_disk_meta.serial + len, ":0");
2394			}
2395		}
2396	}
2397
2398	/* Fill anchor and disks. */
2399	meta = malloc(INTEL_MAX_MD_SIZE(numdisks),
2400	    M_MD_INTEL, M_WAITOK | M_ZERO);
2401	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
2402	meta->config_size = INTEL_MAX_MD_SIZE(numdisks);
2403	meta->config_id = mdi->mdio_config_id;
2404	meta->orig_config_id = mdi->mdio_orig_config_id;
2405	meta->generation = mdi->mdio_generation;
2406	meta->attributes = INTEL_ATTR_CHECKSUM;
2407	meta->total_disks = numdisks;
2408	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2409		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2410		if (pd->pd_disk_pos < 0)
2411			continue;
2412		meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta;
2413		if (pd->pd_disk_meta.sectors_hi != 0)
2414			meta->attributes |= INTEL_ATTR_2TB_DISK;
2415	}
2416
2417	/* Fill volumes and maps. */
2418	vi = 0;
2419	version = INTEL_VERSION_1000;
2420	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2421		pv = vol->v_md_data;
2422		if (vol->v_stopping)
2423			continue;
2424		mvol = intel_get_volume(meta, vi);
2425
2426		/* New metadata may have different volumes order. */
2427		pv->pv_volume_pos = vi;
2428
2429		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2430			sd = &vol->v_subdisks[sdi];
2431			if (sd->sd_disk != NULL)
2432				break;
2433		}
2434		if (sdi >= vol->v_disks_count)
2435			panic("No any filled subdisk in volume");
2436		if (vol->v_mediasize >= 0x20000000000llu)
2437			meta->attributes |= INTEL_ATTR_2TB;
2438		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2439			meta->attributes |= INTEL_ATTR_RAID0;
2440		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2441			meta->attributes |= INTEL_ATTR_RAID1;
2442		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2443			meta->attributes |= INTEL_ATTR_RAID5;
2444		else if ((vol->v_disks_count & 1) == 0)
2445			meta->attributes |= INTEL_ATTR_RAID10;
2446		else
2447			meta->attributes |= INTEL_ATTR_RAID1E;
2448		if (pv->pv_cng)
2449			meta->attributes |= INTEL_ATTR_RAIDCNG;
2450		if (vol->v_strip_size > 131072)
2451			meta->attributes |= INTEL_ATTR_EXT_STRIP;
2452
2453		if (pv->pv_cng)
2454			cv = INTEL_VERSION_1206;
2455		else if (vol->v_disks_count > 4)
2456			cv = INTEL_VERSION_1204;
2457		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2458			cv = INTEL_VERSION_1202;
2459		else if (vol->v_disks_count > 2)
2460			cv = INTEL_VERSION_1201;
2461		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2462			cv = INTEL_VERSION_1100;
2463		else
2464			cv = INTEL_VERSION_1000;
2465		if (strcmp(cv, version) > 0)
2466			version = cv;
2467
2468		strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name));
2469		mvol->total_sectors = vol->v_mediasize / sectorsize;
2470		mvol->state = (INTEL_ST_READ_COALESCING |
2471		    INTEL_ST_WRITE_COALESCING);
2472		mvol->tid = vol->v_global_id + 1;
2473		if (pv->pv_cng) {
2474			mvol->state |= INTEL_ST_CLONE_N_GO;
2475			if (pv->pv_cng_man_sync)
2476				mvol->state |= INTEL_ST_CLONE_MAN_SYNC;
2477			mvol->cng_master_disk = pv->pv_cng_master_disk;
2478			if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state ==
2479			    G_RAID_SUBDISK_S_NONE)
2480				mvol->cng_state = INTEL_CNGST_MASTER_MISSING;
2481			else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
2482				mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE;
2483			else
2484				mvol->cng_state = INTEL_CNGST_UPDATED;
2485		}
2486
2487		/* Check for any recovery in progress. */
2488		state = G_RAID_SUBDISK_S_ACTIVE;
2489		pos = 0x7fffffffffffffffllu;
2490		stale = 0;
2491		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2492			sd = &vol->v_subdisks[sdi];
2493			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD)
2494				state = G_RAID_SUBDISK_S_REBUILD;
2495			else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC &&
2496			    state != G_RAID_SUBDISK_S_REBUILD)
2497				state = G_RAID_SUBDISK_S_RESYNC;
2498			else if (sd->sd_state == G_RAID_SUBDISK_S_STALE)
2499				stale = 1;
2500			if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2501			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
2502			     sd->sd_rebuild_pos < pos)
2503			        pos = sd->sd_rebuild_pos;
2504		}
2505		if (state == G_RAID_SUBDISK_S_REBUILD) {
2506			mvol->migr_state = 1;
2507			mvol->migr_type = INTEL_MT_REBUILD;
2508		} else if (state == G_RAID_SUBDISK_S_RESYNC) {
2509			mvol->migr_state = 1;
2510			/* mvol->migr_type = INTEL_MT_REPAIR; */
2511			mvol->migr_type = INTEL_MT_VERIFY;
2512			mvol->state |= INTEL_ST_VERIFY_AND_FIX;
2513		} else
2514			mvol->migr_state = 0;
2515		mvol->dirty = (vol->v_dirty || stale);
2516
2517		mmap0 = intel_get_map(mvol, 0);
2518
2519		/* Write map / common part of two maps. */
2520		intel_set_map_offset(mmap0, sd->sd_offset / sectorsize);
2521		intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize);
2522		mmap0->strip_sectors = vol->v_strip_size / sectorsize;
2523		if (vol->v_state == G_RAID_VOLUME_S_BROKEN)
2524			mmap0->status = INTEL_S_FAILURE;
2525		else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED)
2526			mmap0->status = INTEL_S_DEGRADED;
2527		else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED)
2528		    == g_raid_nsubdisks(vol, -1))
2529			mmap0->status = INTEL_S_UNINITIALIZED;
2530		else
2531			mmap0->status = INTEL_S_READY;
2532		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2533			mmap0->type = INTEL_T_RAID0;
2534		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
2535		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2536			mmap0->type = INTEL_T_RAID1;
2537		else
2538			mmap0->type = INTEL_T_RAID5;
2539		mmap0->total_disks = vol->v_disks_count;
2540		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2541			mmap0->total_domains = vol->v_disks_count;
2542		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2543			mmap0->total_domains = 2;
2544		else
2545			mmap0->total_domains = 1;
2546		intel_set_map_stripe_count(mmap0,
2547		    sd->sd_size / vol->v_strip_size / mmap0->total_domains);
2548		mmap0->failed_disk_num = 0xff;
2549		mmap0->ddf = 1;
2550
2551		/* If there are two maps - copy common and update. */
2552		if (mvol->migr_state) {
2553			intel_set_vol_curr_migr_unit(mvol,
2554			    pos / vol->v_strip_size / mmap0->total_domains);
2555			mmap1 = intel_get_map(mvol, 1);
2556			memcpy(mmap1, mmap0, sizeof(struct intel_raid_map));
2557			mmap0->status = INTEL_S_READY;
2558		} else
2559			mmap1 = NULL;
2560
2561		/* Write disk indexes and put rebuild flags. */
2562		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2563			sd = &vol->v_subdisks[sdi];
2564			pd = (struct g_raid_md_intel_perdisk *)
2565			    sd->sd_disk->d_md_data;
2566			mmap0->disk_idx[sdi] = pd->pd_disk_pos;
2567			if (mvol->migr_state)
2568				mmap1->disk_idx[sdi] = pd->pd_disk_pos;
2569			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2570			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2571				mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2572			} else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
2573			    sd->sd_state != G_RAID_SUBDISK_S_STALE &&
2574			    sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) {
2575				mmap0->disk_idx[sdi] |= INTEL_DI_RBLD;
2576				if (mvol->migr_state)
2577					mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2578			}
2579			if ((sd->sd_state == G_RAID_SUBDISK_S_NONE ||
2580			     sd->sd_state == G_RAID_SUBDISK_S_FAILED ||
2581			     sd->sd_state == G_RAID_SUBDISK_S_REBUILD) &&
2582			    mmap0->failed_disk_num == 0xff) {
2583				mmap0->failed_disk_num = sdi;
2584				if (mvol->migr_state)
2585					mmap1->failed_disk_num = sdi;
2586			}
2587		}
2588		vi++;
2589	}
2590	meta->total_volumes = vi;
2591	if (vi > 1 || meta->attributes &
2592	     (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB))
2593		version = INTEL_VERSION_1300;
2594	if (strcmp(version, INTEL_VERSION_1300) < 0)
2595		meta->attributes &= INTEL_ATTR_CHECKSUM;
2596	memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1);
2597
2598	/* We are done. Print meta data and store them to disks. */
2599	g_raid_md_intel_print(meta);
2600	if (mdi->mdio_meta != NULL)
2601		free(mdi->mdio_meta, M_MD_INTEL);
2602	mdi->mdio_meta = meta;
2603	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2604		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2605		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
2606			continue;
2607		if (pd->pd_meta != NULL) {
2608			free(pd->pd_meta, M_MD_INTEL);
2609			pd->pd_meta = NULL;
2610		}
2611		pd->pd_meta = intel_meta_copy(meta);
2612		intel_meta_write(disk->d_consumer, meta);
2613	}
2614	return (0);
2615}
2616
2617static int
2618g_raid_md_fail_disk_intel(struct g_raid_md_object *md,
2619    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2620{
2621	struct g_raid_softc *sc;
2622	struct g_raid_md_intel_object *mdi;
2623	struct g_raid_md_intel_perdisk *pd;
2624	struct g_raid_subdisk *sd;
2625
2626	sc = md->mdo_softc;
2627	mdi = (struct g_raid_md_intel_object *)md;
2628	pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data;
2629
2630	/* We can't fail disk that is not a part of array now. */
2631	if (pd->pd_disk_pos < 0)
2632		return (-1);
2633
2634	/*
2635	 * Mark disk as failed in metadata and try to write that metadata
2636	 * to the disk itself to prevent it's later resurrection as STALE.
2637	 */
2638	mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
2639	pd->pd_disk_meta.flags = INTEL_F_FAILED;
2640	g_raid_md_intel_print(mdi->mdio_meta);
2641	if (tdisk->d_consumer)
2642		intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);
2643
2644	/* Change states. */
2645	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
2646	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
2647		g_raid_change_subdisk_state(sd,
2648		    G_RAID_SUBDISK_S_FAILED);
2649		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
2650		    G_RAID_EVENT_SUBDISK);
2651	}
2652
2653	/* Write updated metadata to remaining disks. */
2654	g_raid_md_write_intel(md, NULL, NULL, tdisk);
2655
2656	/* Check if anything left except placeholders. */
2657	if (g_raid_ndisks(sc, -1) ==
2658	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2659		g_raid_destroy_node(sc, 0);
2660	else
2661		g_raid_md_intel_refill(sc);
2662	return (0);
2663}
2664
2665static int
2666g_raid_md_free_disk_intel(struct g_raid_md_object *md,
2667    struct g_raid_disk *disk)
2668{
2669	struct g_raid_md_intel_perdisk *pd;
2670
2671	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2672	if (pd->pd_meta != NULL) {
2673		free(pd->pd_meta, M_MD_INTEL);
2674		pd->pd_meta = NULL;
2675	}
2676	free(pd, M_MD_INTEL);
2677	disk->d_md_data = NULL;
2678	return (0);
2679}
2680
2681static int
2682g_raid_md_free_volume_intel(struct g_raid_md_object *md,
2683    struct g_raid_volume *vol)
2684{
2685	struct g_raid_md_intel_pervolume *pv;
2686
2687	pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data;
2688	free(pv, M_MD_INTEL);
2689	vol->v_md_data = NULL;
2690	return (0);
2691}
2692
2693static int
2694g_raid_md_free_intel(struct g_raid_md_object *md)
2695{
2696	struct g_raid_md_intel_object *mdi;
2697
2698	mdi = (struct g_raid_md_intel_object *)md;
2699	if (!mdi->mdio_started) {
2700		mdi->mdio_started = 0;
2701		callout_stop(&mdi->mdio_start_co);
2702		G_RAID_DEBUG1(1, md->mdo_softc,
2703		    "root_mount_rel %p", mdi->mdio_rootmount);
2704		root_mount_rel(mdi->mdio_rootmount);
2705		mdi->mdio_rootmount = NULL;
2706	}
2707	if (mdi->mdio_meta != NULL) {
2708		free(mdi->mdio_meta, M_MD_INTEL);
2709		mdi->mdio_meta = NULL;
2710	}
2711	return (0);
2712}
2713
/*
 * Invoke G_RAID_MD_DECLARE for this module with the identifier `intel`
 * and the string "Intel".
 * NOTE(review): the macro is defined outside this file (presumably in
 * g_raid.h) and presumably registers this metadata class with the GEOM
 * RAID framework -- confirm against its definition.
 */
2714G_RAID_MD_DECLARE(intel, "Intel");
2715