1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2016 Andrey Sokolov 26 * Copyright 2016 Toomas Soome <tsoome@me.com> 27 * Copyright 2019 Joyent, Inc. 28 */ 29 30 /* 31 * lofi (loopback file) driver - allows you to attach a file to a device, 32 * which can then be accessed through that device. The simple model is that 33 * you tell lofi to open a file, and then use the block device you get as 34 * you would any block device. lofi translates access to the block device 35 * into I/O on the underlying file. This is mostly useful for 36 * mounting images of filesystems. 37 * 38 * lofi is controlled through /dev/lofictl - this is the only device exported 39 * during attach, and is instance number 0. lofiadm communicates with lofi 40 * through ioctls on this device. When a file is attached to lofi, block and 41 * character devices are exported in /dev/lofi and /dev/rlofi. These devices 42 * are identified by lofi instance number, and the instance number is also used 43 * as the name in /dev/lofi. 
44 * 45 * Virtual disks, or, labeled lofi, implements virtual disk support to 46 * support partition table and related tools. Such mappings will cause 47 * block and character devices to be exported in /dev/dsk and /dev/rdsk 48 * directories. 49 * 50 * To support virtual disks, the instance number space is divided to two 51 * parts, upper part for instance number and lower part for minor number 52 * space to identify partitions and slices. The virtual disk support is 53 * implemented by stacking cmlb module. For virtual disks, the partition 54 * related ioctl calls are routed to cmlb module. Compression and encryption 55 * is not supported for virtual disks. 56 * 57 * Mapped devices are tracked with state structures handled with 58 * ddi_soft_state(9F) for simplicity. 59 * 60 * A file attached to lofi is opened when attached and not closed until 61 * explicitly detached from lofi. This seems more sensible than deferring 62 * the open until the /dev/lofi device is opened, for a number of reasons. 63 * One is that any failure is likely to be noticed by the person (or script) 64 * running lofiadm. Another is that it would be a security problem if the 65 * file was replaced by another one after being added but before being opened. 66 * 67 * The only hard part about lofi is the ioctls. In order to support things 68 * like 'newfs' on a lofi device, it needs to support certain disk ioctls. 69 * So it has to fake disk geometry and partition information. More may need 70 * to be faked if your favorite utility doesn't work and you think it should 71 * (fdformat doesn't work because it really wants to know the type of floppy 72 * controller to talk to, and that didn't seem easy to fake. Or possibly even 73 * necessary, since we have mkfs_pcfs now). 74 * 75 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To 76 * support simulation of hotplug events, an optional force flag is provided. 
77 * If a lofi device is open when a force detach is requested, then the 78 * underlying file is closed and any subsequent operations return EIO. When the 79 * device is closed for the last time, it will be cleaned up at that time. In 80 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is 81 * detached but not removed. 82 * 83 * If detach was requested and lofi device is not open, we will perform 84 * unmap and remove the lofi instance. 85 * 86 * If the lofi device is open and the li_cleanup is set on ioctl request, 87 * we set ls_cleanup flag to notify the cleanup is requested, and the 88 * last lofi_close will perform the unmapping and this lofi instance will be 89 * removed. 90 * 91 * If the lofi device is open and the li_force is set on ioctl request, 92 * we set ls_cleanup flag to notify the cleanup is requested, 93 * we also set ls_vp_closereq to notify IO tasks to return EIO on new 94 * IO requests and wait in process IO count to become 0, indicating there 95 * are no more IO requests. Since ls_cleanup is set, the last lofi_close 96 * will perform unmap and this lofi instance will be removed. 97 * See also lofi_unmap_file() for details. 98 * 99 * Once ls_cleanup is set for the instance, we do not allow lofi_open() 100 * calls to succeed and can have last lofi_close() to remove the instance. 101 * 102 * Known problems: 103 * 104 * UFS logging. Mounting a UFS filesystem image "logging" 105 * works for basic copy testing but wedges during a build of ON through 106 * that image. Some deadlock in lufs holding the log mutex and then 107 * getting stuck on a buf. So for now, don't do that. 108 * 109 * Direct I/O. Since the filesystem data is being cached in the buffer 110 * cache, _and_ again in the underlying filesystem, it's tempting to 111 * enable direct I/O on the underlying file. Don't, because that deadlocks. 112 * I think to fix the cache-twice problem we might need filesystem support. 
113 * 114 * Interesting things to do: 115 * 116 * Allow multiple files for each device. A poor-man's metadisk, basically. 117 * 118 * Pass-through ioctls on block devices. You can (though it's not 119 * documented), give lofi a block device as a file name. Then we shouldn't 120 * need to fake a geometry, however, it may be relevant if you're replacing 121 * metadisk, or using lofi to get crypto. 122 * It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1 123 * and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home. 124 * In fact this even makes sense if you have lofi "above" metadisk. 125 * 126 * Encryption: 127 * Each lofi device can have its own symmetric key and cipher. 128 * They are passed to us by lofiadm(1m) in the correct format for use 129 * with the misc/kcf crypto_* routines. 130 * 131 * Each block has its own IV, that is calculated in lofi_blk_mech(), based 132 * on the "master" key held in the lsp and the block number of the buffer. 133 */ 134 135 #include <sys/types.h> 136 #include <netinet/in.h> 137 #include <sys/sysmacros.h> 138 #include <sys/uio.h> 139 #include <sys/kmem.h> 140 #include <sys/cred.h> 141 #include <sys/mman.h> 142 #include <sys/errno.h> 143 #include <sys/aio_req.h> 144 #include <sys/stat.h> 145 #include <sys/file.h> 146 #include <sys/modctl.h> 147 #include <sys/conf.h> 148 #include <sys/debug.h> 149 #include <sys/vnode.h> 150 #include <sys/lofi.h> 151 #include <sys/lofi_impl.h> /* for cache structure */ 152 #include <sys/fcntl.h> 153 #include <sys/pathname.h> 154 #include <sys/filio.h> 155 #include <sys/fdio.h> 156 #include <sys/open.h> 157 #include <sys/disp.h> 158 #include <vm/seg_map.h> 159 #include <sys/ddi.h> 160 #include <sys/sunddi.h> 161 #include <sys/zmod.h> 162 #include <sys/id_space.h> 163 #include <sys/mkdev.h> 164 #include <sys/crypto/common.h> 165 #include <sys/crypto/api.h> 166 #include <sys/rctl.h> 167 #include <sys/vtoc.h> 168 #include <sys/scsi/scsi.h> /* for DTYPE_DIRECT */ 169 
#include <sys/scsi/impl/uscsi.h>
#include <sys/sysevent/dev.h>
#include <sys/efi_partition.h>
#include <sys/note.h>
#include <LzmaDec.h>

/* Names of the properties exported on mapped lofi device nodes. */
#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME		"Size"
#define	ZONE_PROP_NAME		"zone"

/*
 * Initialize a crypto_data_t to describe a raw in-core buffer of the
 * given length.  NOTE(review): expands to a statement sequence, not a
 * single statement — callers must not use it in an unbraced conditional.
 */
#define	SETUP_C_DATA(cd, buf, len)		\
	(cd).cd_format = CRYPTO_DATA_RAW;	\
	(cd).cd_offset = 0;			\
	(cd).cd_miscdata = NULL;		\
	(cd).cd_length = (len);			\
	(cd).cd_raw.iov_base = (buf);		\
	(cd).cd_raw.iov_len = (len);

/*
 * Reject uio requests that are not aligned to DEV_BSIZE.
 * NOTE(review): contains a hidden "return (EINVAL)" out of the
 * calling function.
 */
#define	UIO_CHECK(uio)	\
	if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
	    ((uio)->uio_resid % DEV_BSIZE) != 0) { \
		return (EINVAL); \
	}

#define	LOFI_TIMEOUT	120

int lofi_timeout = LOFI_TIMEOUT;
static void *lofi_statep;		/* ddi_soft_state(9F) handle */
static kmutex_t lofi_lock;		/* state lock */
static id_space_t *lofi_id;		/* lofi ID values */
static list_t lofi_list;		/* all mapped lofi instances */
static zone_key_t lofi_zone_key;

/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high. If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size. We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */

const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;

/*
 * To avoid decompressing data in a compressed segment multiple times
 * when accessing small parts of a segment's data, we cache and reuse
 * the uncompressed segment's data.
 *
 * A single cached segment is sufficient to avoid lots of duplicate
 * segment decompress operations. A small cache size also reduces the
 * memory footprint.
 *
 * lofi_max_comp_cache is the maximum number of decompressed data segments
 * cached for each compressed lofi image. It can be set to 0 to disable
 * caching.
 */

uint32_t lofi_max_comp_cache = 1;

static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
	size_t *dstlen, int level);

/*
 * Table of supported decompression methods; the "level" column is the
 * gzip level a matching image was compressed with (unused for lzma).
 */
lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"},
	{lzma_decompress,	NULL,	0,	"lzma"}
};

static void lofi_strategy_task(void *);
static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
    size_t, void *);
static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);

/* Target-geometry ops vector handed to cmlb for labeled (virtual disk) lofi. */
struct cmlb_tg_ops lofi_tg_ops = {
	TG_DK_OPS_VERSION_1,
	lofi_tg_rdwr,
	lofi_tg_getinfo
};

/* kmem-backed allocation callback for the LZMA decoder. */
/*ARGSUSED*/
static void
*SzAlloc(void *p, size_t size)
{
	return (kmem_alloc(size, KM_SLEEP));
}

/* kmem-backed free callback for the LZMA decoder. */
/*ARGSUSED*/
static void
SzFree(void *p, void *address, size_t size)
{
	kmem_free(address, size);
}

/* Allocator vector passed to LzmaDecode(). */
static ISzAlloc g_Alloc = { SzAlloc, SzFree };

/*
 * Free data referenced by the linked list of cached uncompressed
 * segments.
 */
static void
lofi_free_comp_cache(struct lofi_state *lsp)
{
	struct lofi_comp_cache *lc;

	while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}
	ASSERT(lsp->ls_comp_cache_count == 0);
}

/*
 * Return non-zero if any opener (layered or regular, on any partition)
 * still has this lofi instance open.  Caller must hold lofi_lock.
 */
static int
is_opened(struct lofi_state *lsp)
{
	int i;
	boolean_t last = B_TRUE;

	ASSERT(MUTEX_HELD(&lofi_lock));
	for (i = 0; i < LOFI_PART_MAX; i++) {
		if (lsp->ls_open_lyr[i]) {
			last = B_FALSE;
			break;
		}
	}

	for (i = 0; last && (i < OTYP_LYR); i++) {
		if (lsp->ls_open_reg[i]) {
			last = B_FALSE;
		}
	}

	return (!last);
}

/*
 * Mark the instance for cleanup; the last lofi_close() will then unmap
 * and destroy it.  Caller must hold lofi_lock.
 */
static void
lofi_set_cleanup(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	lsp->ls_cleanup = B_TRUE;

	/* wake up any threads waiting on dkiocstate */
	cv_broadcast(&lsp->ls_vp_cv);
}

/*
 * Release all crypto state for this instance: zeroize and free the key,
 * free the cipher and IV mechanism parameters, destroy the crypto lock.
 */
static void
lofi_free_crypto(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_crypto_enabled) {
		/*
		 * Clean up the crypto state so that it doesn't hang around
		 * in memory after we are done with it.
		 */
		if (lsp->ls_key.ck_data != NULL) {
			bzero(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			kmem_free(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			lsp->ls_key.ck_data = NULL;
			lsp->ls_key.ck_length = 0;
		}

		if (lsp->ls_mech.cm_param != NULL) {
			kmem_free(lsp->ls_mech.cm_param,
			    lsp->ls_mech.cm_param_len);
			lsp->ls_mech.cm_param = NULL;
			lsp->ls_mech.cm_param_len = 0;
		}

		if (lsp->ls_iv_mech.cm_param != NULL) {
			kmem_free(lsp->ls_iv_mech.cm_param,
			    lsp->ls_iv_mech.cm_param_len);
			lsp->ls_iv_mech.cm_param = NULL;
			lsp->ls_iv_mech.cm_param_len = 0;
		}

		mutex_destroy(&lsp->ls_crypto_lock);
	}
}

/*
 * cmlb tg_ops read/write entry point: transfer 'length' bytes at disk
 * address 'start' by dispatching a buf to the per-instance taskq and
 * waiting for its completion.  Only whole logical blocks are accepted.
 */
/* ARGSUSED */
static int
lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	struct lofi_state *lsp;
	buf_t *bp;
	int instance;
	int rv = 0;

	instance = ddi_get_instance(dip);
	if (instance == 0)	/* control node does not have disk */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	if (cmd != TG_READ && cmd != TG_WRITE)
		return (EINVAL);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	bp = getrbuf(KM_SLEEP);

	if (cmd == TG_READ) {
		bp->b_flags = B_READ;
	} else {
		if (lsp->ls_readonly == B_TRUE) {
			freerbuf(bp);
			return (EROFS);
		}
		bp->b_flags = B_WRITE;
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	bp->b_lblkno = start;
	bp->b_private = NULL;
	bp->b_edev = lsp->ls_dev;

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	(void) biowait(bp);

	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}

/*
 * Get device geometry info for cmlb.
 *
 * We have mapped disk image as virtual block device and have to report
 * physical/virtual geometry to cmlb.
 *
 * So we have two principal cases:
 * 1. Uninitialised image without any existing labels,
 *    for this case we fabricate the data based on mapped image.
 * 2. Image with existing label information.
 *    Since we have no information how the image was created (it may be
 *    dump from some physical device), we need to rely on label information
 *    from image, or we get "corrupted label" errors.
 *    NOTE: label can be MBR, MBR+SMI, GPT
 */
static int
lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	struct lofi_state *lsp;
	int instance;
	int ashift;

	_NOTE(ARGUNUSED(tg_cookie));
	instance = ddi_get_instance(dip);
	if (instance == 0)	/* control device has no storage */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 *
	 * When mapping is created, new lofi instance is created and
	 * lofi_attach() will call cmlb_attach() as part of the procedure
	 * to set the mapping up. This chain of events will happen in
	 * the same thread.
	 * Since cmlb_attach() will call lofi_tg_getinfo to get
	 * capacity, we return error on that call if cookie is set,
	 * otherwise lofi_attach will be stuck as the mapping is not yet
	 * finalized and lofi is not yet ready.
	 * Note, such error is not fatal for cmlb, as the label setup
	 * will be finalized when cmlb_validate() is called.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
		mutex_exit(&lsp->ls_vp_lock);
		return (ENXIO);
	}
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	ashift = lsp->ls_lbshift;

	switch (cmd) {
	case TG_GETPHYGEOM: {
		cmlb_geom_t *geomp = arg;

		geomp->g_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		geomp->g_nsect = lsp->ls_dkg.dkg_nsect;
		geomp->g_nhead = lsp->ls_dkg.dkg_nhead;
		geomp->g_acyl = lsp->ls_dkg.dkg_acyl;
		geomp->g_ncyl = lsp->ls_dkg.dkg_ncyl;
		geomp->g_secsize = (1U << ashift);
		geomp->g_intrlv = lsp->ls_dkg.dkg_intrlv;
		geomp->g_rpm = lsp->ls_dkg.dkg_rpm;
		return (0);
	}

	case TG_GETCAPACITY:
		*(diskaddr_t *)arg =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << ashift);
		return (0);

	case TG_GETATTR: {
		tg_attribute_t *tgattr = arg;

		tgattr->media_is_writable = !lsp->ls_readonly;
		tgattr->media_is_solid_state = B_FALSE;
		tgattr->media_is_rotational = B_FALSE;
		return (0);
	}

	default:
		return (EINVAL);
	}
}

/*
 * Tear down a lofi instance: drain and destroy its taskq, release crypto
 * state, compression buffers and caches, close and release the backing
 * vnode(s), delete kstats, drop the zone reference, destroy locks/cv,
 * offline the devinfo node and return the instance id to the id space.
 * Caller must hold lofi_lock.
 */
static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
	int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
	int i;

	ASSERT(MUTEX_HELD(&lofi_lock));

	/*
	 * Before we can start to release the other resources,
	 * make sure we have all tasks completed and taskq removed.
	 */
	if (lsp->ls_taskq != NULL) {
		taskq_destroy(lsp->ls_taskq);
		lsp->ls_taskq = NULL;
	}

	list_remove(&lofi_list, lsp);

	lofi_free_crypto(lsp);

	/*
	 * Free pre-allocated compressed buffers
	 */
	if (lsp->ls_comp_bufs != NULL) {
		for (i = 0; i < lofi_taskq_nthreads; i++) {
			if (lsp->ls_comp_bufs[i].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[i].buf,
				    lsp->ls_comp_bufs[i].bufsize);
		}
		kmem_free(lsp->ls_comp_bufs,
		    sizeof (struct compbuf) * lofi_taskq_nthreads);
	}

	if (lsp->ls_vp != NULL) {
		(void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_FREE, credp, NULL);
		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
		    1, 0, credp, NULL);
		VN_RELE(lsp->ls_vp);
	}
	if (lsp->ls_stacked_vp != lsp->ls_vp)
		VN_RELE(lsp->ls_stacked_vp);
	lsp->ls_vp = lsp->ls_stacked_vp = NULL;

	if (lsp->ls_kstat != NULL) {
		kstat_delete(lsp->ls_kstat);
		lsp->ls_kstat = NULL;
	}

	/*
	 * Free cached decompressed segment data
	 */
	lofi_free_comp_cache(lsp);
	list_destroy(&lsp->ls_comp_cache);

	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}

	rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);

	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
	lsp->ls_vp_ready = B_FALSE;
	lsp->ls_vp_closereq = B_FALSE;

	ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
	(void) ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE);
	id_free(lofi_id, id);
}

/*
 * Remove the device-facing pieces of an instance: invalidate and detach
 * the cmlb label handle, then remove all properties and minor nodes.
 * Caller must hold lofi_lock.
 */
static void
lofi_free_dev(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_invalidate(lsp->ls_cmlbhandle, 0);
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
		lsp->ls_cmlbhandle = NULL;
	}
	(void) ddi_prop_remove_all(lsp->ls_dip);
	ddi_remove_minor_node(lsp->ls_dip, NULL);
}

/*
 * Zone shutdown callback: destroy (or mark for cleanup) every lofi
 * instance belonging to the zone being shut down.
 */
/*ARGSUSED*/
static void
lofi_zone_shutdown(zoneid_t zoneid, void *arg)
{
	struct lofi_state *lsp;
	struct lofi_state *next;

	mutex_enter(&lofi_lock);

	for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {

		/* lofi_destroy() frees lsp */
		next = list_next(&lofi_list, lsp);

		if (lsp->ls_zone.zref_zone->zone_id != zoneid)
			continue;

		/*
		 * No in-zone processes are running, but something has this
		 * open. It's either a global zone process, or a lofi
		 * mount. In either case we set ls_cleanup so the last
		 * user destroys the device.
		 */
		if (is_opened(lsp)) {
			lofi_set_cleanup(lsp);
		} else {
			lofi_free_dev(lsp);
			lofi_destroy(lsp, kcred);
		}
	}

	mutex_exit(&lofi_lock);
}

/*
 * open(9E) entry point.  Validates the open type, enforces read-only and
 * exclusive-open semantics per partition mask, and for labeled (cmlb)
 * instances checks that the requested partition exists unless this is a
 * non-blocking open used by partitioning tools.
 */
/*ARGSUSED*/
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
{
	int id;
	minor_t	part;
	uint64_t mask;
	diskaddr_t nblks;
	diskaddr_t lba;
	boolean_t ndelay;

	struct lofi_state *lsp;

	if (otyp >= OTYPCNT)
		return (EINVAL);

	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;

	/*
	 * lofiadm -a /dev/lofi/1 gets us here.
	 */
	if (mutex_owner(&lofi_lock) == curthread)
		return (EINVAL);

	mutex_enter(&lofi_lock);

	id = LOFI_MINOR2ID(getminor(*devp));
	part = LOFI_PART(getminor(*devp));
	mask = (1U << part);

	/* master control device */
	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (lsp->ls_cleanup == B_TRUE) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (lsp->ls_vp == NULL) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (lsp->ls_readonly && (flag & FWRITE)) {
		mutex_exit(&lofi_lock);
		return (EROFS);
	}

	if ((lsp->ls_open_excl) & (mask)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	if (flag & FEXCL) {
		if (lsp->ls_open_lyr[part]) {
			mutex_exit(&lofi_lock);
			return (EBUSY);
		}
		for (int i = 0; i < OTYP_LYR; i++) {
			if (lsp->ls_open_reg[i] & mask) {
				mutex_exit(&lofi_lock);
				return (EBUSY);
			}
		}
	}

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
			/*
			 * non-blocking opens are allowed to succeed to
			 * support format and fdisk to create partitioning.
			 */
			if (!ndelay) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba,
		    NULL, NULL, 0) == 0) {
			if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (!ndelay) {
			mutex_exit(&lofi_lock);
			return (ENXIO);
		}
	}

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]++;
	} else {
		lsp->ls_open_reg[otyp] |= mask;
	}
	if (flag & FEXCL) {
		lsp->ls_open_excl |= mask;
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * close(9E) entry point.  Drops the open accounting for this partition
 * and open type, and performs the deferred unmap/destroy when the last
 * opener leaves a device marked for cleanup or force-detached.
 */
/*ARGSUSED*/
static int
lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
{
	minor_t	part;
	int id;
	uint64_t mask;
	struct lofi_state *lsp;

	id = LOFI_MINOR2ID(getminor(dev));
	part = LOFI_PART(getminor(dev));
	mask = (1U << part);

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	if (lsp->ls_open_excl & mask)
		lsp->ls_open_excl &= ~mask;

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]--;
	} else {
		lsp->ls_open_reg[otyp] &= ~mask;
	}

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
	 */
	if (!is_opened(lsp) &&
	    (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL)) {
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * Sets the mechanism's initialization vector (IV) if one is needed.
 * The IV is computed from the data block number.  lsp->ls_mech is
 * altered so that:
 *	lsp->ls_mech.cm_param_len is set to the IV len.
 *	lsp->ls_mech.cm_param is set to the IV.
 */
static int
lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
{
	int	ret;
	crypto_data_t cdata;
	char	*iv;
	size_t	iv_len;
	size_t	min;
	void	*data;
	size_t	datasz;

	ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));

	/*
	 * NOTE(review): lsp has already been dereferenced by the ASSERT
	 * above, so this NULL check can never trigger usefully — confirm
	 * whether it can be removed upstream.
	 */
	if (lsp == NULL)
		return (CRYPTO_DEVICE_ERROR);

	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
	if (lsp->ls_iv_type == IVM_NONE) {
		return (CRYPTO_SUCCESS);
	}

	/*
	 * if kmem already alloced from previous call and it's the same size
	 * we need now, just recycle it; allocate new kmem only if we have to
	 */
	if (lsp->ls_mech.cm_param == NULL ||
	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
		iv_len = lsp->ls_iv_len;
		iv = kmem_zalloc(iv_len, KM_SLEEP);
	} else {
		iv_len = lsp->ls_mech.cm_param_len;
		iv = lsp->ls_mech.cm_param;
		bzero(iv, iv_len);
	}

	switch (lsp->ls_iv_type) {
	case IVM_ENC_BLKNO:
		/* iv is not static, lblkno changes each time */
		data = &lblkno;
		datasz = sizeof (lblkno);
		break;
	default:
		data = 0;
		datasz = 0;
		break;
	}

	/*
	 * write blkno into the iv buffer padded on the left in case
	 * blkno ever grows bigger than its current longlong_t size
	 * or a variation other than blkno is used for the iv data
	 */
	min = MIN(datasz, iv_len);
	bcopy(data, iv + (iv_len - min), min);

	/* encrypt the data in-place to get the IV */
	SETUP_C_DATA(cdata, iv, iv_len);

	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
	    NULL, NULL, NULL);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
		    lblkno, ret);
		if (lsp->ls_mech.cm_param != iv)
			kmem_free(iv, iv_len);

		return (ret);
	}

	/* clean up the iv from the last computation */
	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
		kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);

	lsp->ls_mech.cm_param_len = iv_len;
	lsp->ls_mech.cm_param = iv;

	return (CRYPTO_SUCCESS);
}

/*
 * Performs encryption and decryption of a chunk of data of size "len",
 * one DEV_BSIZE block at a time. "len" is assumed to be a multiple of
 * DEV_BSIZE.
 */
static int
lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
    caddr_t ciphertext, size_t len, boolean_t op_encrypt)
{
	crypto_data_t cdata;
	crypto_data_t wdata;
	int ret;
	longlong_t lblkno = bp->b_lblkno;

	mutex_enter(&lsp->ls_crypto_lock);

	/*
	 * though we could encrypt/decrypt entire "len" chunk of data, we need
	 * to break it into DEV_BSIZE pieces to capture blkno incrementing
	 */
	SETUP_C_DATA(cdata, plaintext, len);
	cdata.cd_length = DEV_BSIZE;
	if (ciphertext != NULL) {	/* not in-place crypto */
		SETUP_C_DATA(wdata, ciphertext, len);
		wdata.cd_length = DEV_BSIZE;
	}

	do {
		ret = lofi_blk_mech(lsp, lblkno);
		if (ret != CRYPTO_SUCCESS)
			continue;

		if (op_encrypt) {
			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		} else {
			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		}

		cdata.cd_offset += DEV_BSIZE;
		if (ciphertext != NULL)
			wdata.cd_offset += DEV_BSIZE;
		lblkno++;
	} while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);

	mutex_exit(&lsp->ls_crypto_lock);

	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)",
		    op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
		    lblkno, ret);
	}

	return (ret);
}

/* Transfer methods for lofi_rdwr(): direct vnode I/O vs. bcopy to/from map. */
#define	RDWR_RAW	1
#define	RDWR_BCOPY	2

/*
 * Perform one read or write of "len" bytes for bp, for both plain and
 * encrypted instances.  RDWR_RAW does vn_rdwr() against the backing
 * vnode at "offset"; RDWR_BCOPY copies to/from "bcopy_locn" (a segmap
 * address) and deliberately does not touch bp->b_resid.
 */
static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
{
	ssize_t resid;
	int isread;
	int error;

	/*
	 * Handles reads/writes for both plain and encrypted lofi
	 * Note: offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */

	isread = bp->b_flags & B_READ;
	if (isread) {
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(bcopy_locn, bufaddr, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled && error == 0) {
			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
			    B_FALSE) != CRYPTO_SUCCESS) {
				/*
				 * XXX: original code didn't set residual
				 * back to len because no error was expected
				 * from bcopy() if encryption is not enabled
				 */
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				error = EIO;
			}
		}
		return (error);
	} else {
		void *iobuf = bufaddr;

		if (lsp->ls_crypto_enabled) {
			/* don't do in-place crypto to keep bufaddr intact */
			iobuf = kmem_alloc(len, KM_SLEEP);
			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
			    B_TRUE) != CRYPTO_SUCCESS) {
				kmem_free(iobuf, len);
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				return (EIO);
			}
		}
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(iobuf, bcopy_locn, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled) {
			kmem_free(iobuf, len);
		}
		return (error);
	}
}

/*
 * Service bp by mapping the backing vnode through segkmap and copying
 * MAXBSIZE-sized windows, faulting pages in first so I/O errors surface
 * as error returns rather than panics in bcopy().
 */
static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;
	int	save_error;

	/*
	 * Note: offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |       len
	 *    v    v        v
	 * ===|====X========|====...======|========X====|====
	 *    /-------------...---------------/
	 *        ^ bp->b_bcount/bp->b_resid at start
	 *    /----/--------/----...------/--------/
	 *   ^       ^  ^            ^       ^
	 *   |       |  |            |       nth xfersize (<= MAXBSIZE)
	 *   |       |  2nd thru n-1st xfersize (= MAXBSIZE)
	 *   |       1st xfersize (<= MAXBSIZE)
	 *  mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary.  "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/* error may be non-zero for encrypted lofi */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		save_error = segmap_release(segkmap, mapaddr, smflags);
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}

/*
 * Check if segment seg_index is present in the decompressed segment
 * data cache.
 *
 * Returns a pointer to the decompressed segment data cache entry if
 * found, and NULL when decompressed data for this segment is not yet
 * cached.
 */
static struct lofi_comp_cache *
lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
	    lc = list_next(&lsp->ls_comp_cache, lc)) {
		if (lc->lc_index == seg_index) {
			/*
			 * Decompressed segment data was found in the
			 * cache.
			 *
			 * The cache uses an LRU replacement strategy;
			 * move the entry to head of list.
			 */
			list_remove(&lsp->ls_comp_cache, lc);
			list_insert_head(&lsp->ls_comp_cache, lc);
			return (lc);
		}
	}
	return (NULL);
}

/*
 * Add the data for a decompressed segment at segment index
 * seg_index to the cache of the decompressed segments.
 *
 * Returns a pointer to the cache element structure in case
 * the data was added to the cache; returns NULL when the data
 * wasn't cached.
 */
static struct lofi_comp_cache *
lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
    uchar_t *data)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	/* shrink the cache if the lofi_max_comp_cache tunable was lowered */
	while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		ASSERT(lc != NULL);
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}

	/*
	 * Do not cache when disabled by tunable variable
	 */
	if (lofi_max_comp_cache == 0)
		return (NULL);

	/*
	 * When the cache has not yet reached the maximum allowed
	 * number of segments, allocate a new cache element.
	 * Otherwise the cache is full; reuse the last list element
	 * (LRU) for caching the decompressed segment data.
	 *
	 * The cache element for the new decompressed segment data is
	 * added to the head of the list.
	 */
	if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
		lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
		lc->lc_data = NULL;
		list_insert_head(&lsp->ls_comp_cache, lc);
		lsp->ls_comp_cache_count++;
	} else {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		if (lc == NULL)
			return (NULL);
		list_insert_head(&lsp->ls_comp_cache, lc);
	}

	/*
	 * Free old uncompressed segment data when reusing a cache
	 * entry.
	 */
	if (lc->lc_data != NULL)
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);

	/* the cache takes ownership of 'data'; the caller must not free it */
	lc->lc_data = data;
	lc->lc_index = seg_index;
	return (lc);
}


/*
 * Decompress a gzip-compressed segment.  src/srclen hold the compressed
 * data; dst is the output buffer whose capacity is *dstlen, which
 * z_uncompress() updates to the actual decompressed size.  'level' is
 * unused for decompression (hence ARGSUSED).  Returns 0 on success,
 * -1 on failure.
 */
/*ARGSUSED*/
static int
gzip_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	/* the output buffer must at least cover the compressed input */
	ASSERT(*dstlen >= srclen);

	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
		return (-1);
	return (0);
}

/*
 * Size of the header preceding LZMA payload data: the LZMA properties
 * block followed by 8 more bytes (per the LZMA SDK's .lzma layout,
 * presumably the 64-bit uncompressed-size field).
 */
#define	LZMA_HEADER_SIZE	(LZMA_PROPS_SIZE + 8)
/*
 * Decompress an LZMA-compressed segment via the LZMA SDK's LzmaDecode().
 * The properties are read from the start of 'src'; the payload begins
 * LZMA_HEADER_SIZE bytes in.  'level' is unused for decompression.
 * Returns 0 on success, -1 on failure.
 */
/*ARGSUSED*/
static int
lzma_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	size_t insizepure;
	void *actual_src;
	ELzmaStatus status;

	insizepure = srclen - LZMA_HEADER_SIZE;
	actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);

	if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
	    (const Byte *)actual_src, &insizepure,
	    (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
	    &g_Alloc) != SZ_OK) {
		return (-1);
	}
	return (0);
}

/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
1284 */ 1285 static void 1286 lofi_strategy_task(void *arg) 1287 { 1288 struct buf *bp = (struct buf *)arg; 1289 int error; 1290 int syncflag = 0; 1291 struct lofi_state *lsp; 1292 offset_t offset; 1293 caddr_t bufaddr; 1294 size_t len; 1295 size_t xfersize; 1296 boolean_t bufinited = B_FALSE; 1297 1298 lsp = ddi_get_soft_state(lofi_statep, 1299 LOFI_MINOR2ID(getminor(bp->b_edev))); 1300 1301 if (lsp == NULL) { 1302 error = ENXIO; 1303 goto errout; 1304 } 1305 if (lsp->ls_kstat) { 1306 mutex_enter(lsp->ls_kstat->ks_lock); 1307 kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat)); 1308 mutex_exit(lsp->ls_kstat->ks_lock); 1309 } 1310 1311 mutex_enter(&lsp->ls_vp_lock); 1312 lsp->ls_vp_iocount++; 1313 mutex_exit(&lsp->ls_vp_lock); 1314 1315 bp_mapin(bp); 1316 bufaddr = bp->b_un.b_addr; 1317 offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private) 1318 << lsp->ls_lbshift; /* offset within file */ 1319 if (lsp->ls_crypto_enabled) { 1320 /* encrypted data really begins after crypto header */ 1321 offset += lsp->ls_crypto_offset; 1322 } 1323 len = bp->b_bcount; 1324 bufinited = B_TRUE; 1325 1326 if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { 1327 error = EIO; 1328 goto errout; 1329 } 1330 1331 /* 1332 * If we're writing and the buffer was not B_ASYNC 1333 * we'll follow up with a VOP_FSYNC() to force any 1334 * asynchronous I/O to stable storage. 1335 */ 1336 if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC)) 1337 syncflag = FSYNC; 1338 1339 /* 1340 * We used to always use vn_rdwr here, but we cannot do that because 1341 * we might decide to read or write from the the underlying 1342 * file during this call, which would be a deadlock because 1343 * we have the rw_lock. So instead we page, unless it's not 1344 * mapable or it's a character device or it's an encrypted lofi. 
1345 */ 1346 if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) || 1347 lsp->ls_crypto_enabled) { 1348 error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW, 1349 NULL); 1350 } else if (lsp->ls_uncomp_seg_sz == 0) { 1351 error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp); 1352 } else { 1353 uchar_t *compressed_seg = NULL, *cmpbuf; 1354 uchar_t *uncompressed_seg = NULL; 1355 lofi_compress_info_t *li; 1356 size_t oblkcount; 1357 ulong_t seglen; 1358 uint64_t sblkno, eblkno, cmpbytes; 1359 uint64_t uncompressed_seg_index; 1360 struct lofi_comp_cache *lc; 1361 offset_t sblkoff, eblkoff; 1362 u_offset_t salign, ealign; 1363 u_offset_t sdiff; 1364 uint32_t comp_data_sz; 1365 uint64_t i; 1366 int j; 1367 1368 /* 1369 * From here on we're dealing primarily with compressed files 1370 */ 1371 ASSERT(!lsp->ls_crypto_enabled); 1372 1373 /* 1374 * Compressed files can only be read from and 1375 * not written to 1376 */ 1377 if (!(bp->b_flags & B_READ)) { 1378 bp->b_resid = bp->b_bcount; 1379 error = EROFS; 1380 goto done; 1381 } 1382 1383 ASSERT(lsp->ls_comp_algorithm_index >= 0); 1384 li = &lofi_compress_table[lsp->ls_comp_algorithm_index]; 1385 /* 1386 * Compute starting and ending compressed segment numbers 1387 * We use only bitwise operations avoiding division and 1388 * modulus because we enforce the compression segment size 1389 * to a power of 2 1390 */ 1391 sblkno = offset >> lsp->ls_comp_seg_shift; 1392 sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1); 1393 eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift; 1394 eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1); 1395 1396 /* 1397 * Check the decompressed segment cache. 1398 * 1399 * The cache is used only when the requested data 1400 * is within a segment. Requests that cross 1401 * segment boundaries bypass the cache. 
1402 */ 1403 if (sblkno == eblkno || 1404 (sblkno + 1 == eblkno && eblkoff == 0)) { 1405 /* 1406 * Request doesn't cross a segment boundary, 1407 * now check the cache. 1408 */ 1409 mutex_enter(&lsp->ls_comp_cache_lock); 1410 lc = lofi_find_comp_data(lsp, sblkno); 1411 if (lc != NULL) { 1412 /* 1413 * We've found the decompressed segment 1414 * data in the cache; reuse it. 1415 */ 1416 bcopy(lc->lc_data + sblkoff, bufaddr, 1417 bp->b_bcount); 1418 mutex_exit(&lsp->ls_comp_cache_lock); 1419 bp->b_resid = 0; 1420 error = 0; 1421 goto done; 1422 } 1423 mutex_exit(&lsp->ls_comp_cache_lock); 1424 } 1425 1426 /* 1427 * Align start offset to block boundary for segmap 1428 */ 1429 salign = lsp->ls_comp_seg_index[sblkno]; 1430 sdiff = salign & (DEV_BSIZE - 1); 1431 salign -= sdiff; 1432 if (eblkno >= (lsp->ls_comp_index_sz - 1)) { 1433 /* 1434 * We're dealing with the last segment of 1435 * the compressed file -- the size of this 1436 * segment *may not* be the same as the 1437 * segment size for the file 1438 */ 1439 eblkoff = (offset + bp->b_bcount) & 1440 (lsp->ls_uncomp_last_seg_sz - 1); 1441 ealign = lsp->ls_vp_comp_size; 1442 } else { 1443 ealign = lsp->ls_comp_seg_index[eblkno + 1]; 1444 } 1445 1446 /* 1447 * Preserve original request paramaters 1448 */ 1449 oblkcount = bp->b_bcount; 1450 1451 /* 1452 * Assign the calculated parameters 1453 */ 1454 comp_data_sz = ealign - salign; 1455 bp->b_bcount = comp_data_sz; 1456 1457 /* 1458 * Buffers to hold compressed segments are pre-allocated 1459 * on a per-thread basis. Find a pre-allocated buffer 1460 * that is not currently in use and mark it for use. 
1461 */ 1462 mutex_enter(&lsp->ls_comp_bufs_lock); 1463 for (j = 0; j < lofi_taskq_nthreads; j++) { 1464 if (lsp->ls_comp_bufs[j].inuse == 0) { 1465 lsp->ls_comp_bufs[j].inuse = 1; 1466 break; 1467 } 1468 } 1469 1470 mutex_exit(&lsp->ls_comp_bufs_lock); 1471 ASSERT(j < lofi_taskq_nthreads); 1472 1473 /* 1474 * If the pre-allocated buffer size does not match 1475 * the size of the I/O request, re-allocate it with 1476 * the appropriate size 1477 */ 1478 if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) { 1479 if (lsp->ls_comp_bufs[j].bufsize > 0) 1480 kmem_free(lsp->ls_comp_bufs[j].buf, 1481 lsp->ls_comp_bufs[j].bufsize); 1482 lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount, 1483 KM_SLEEP); 1484 lsp->ls_comp_bufs[j].bufsize = bp->b_bcount; 1485 } 1486 compressed_seg = lsp->ls_comp_bufs[j].buf; 1487 1488 /* 1489 * Map in the calculated number of blocks 1490 */ 1491 error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign, 1492 bp, lsp); 1493 1494 bp->b_bcount = oblkcount; 1495 bp->b_resid = oblkcount; 1496 if (error != 0) 1497 goto done; 1498 1499 /* 1500 * decompress compressed blocks start 1501 */ 1502 cmpbuf = compressed_seg + sdiff; 1503 for (i = sblkno; i <= eblkno; i++) { 1504 ASSERT(i < lsp->ls_comp_index_sz - 1); 1505 uchar_t *useg; 1506 1507 /* 1508 * The last segment is special in that it is 1509 * most likely not going to be the same 1510 * (uncompressed) size as the other segments. 1511 */ 1512 if (i == (lsp->ls_comp_index_sz - 2)) { 1513 seglen = lsp->ls_uncomp_last_seg_sz; 1514 } else { 1515 seglen = lsp->ls_uncomp_seg_sz; 1516 } 1517 1518 /* 1519 * Each of the segment index entries contains 1520 * the starting block number for that segment. 1521 * The number of compressed bytes in a segment 1522 * is thus the difference between the starting 1523 * block number of this segment and the starting 1524 * block number of the next segment. 
1525 */ 1526 cmpbytes = lsp->ls_comp_seg_index[i + 1] - 1527 lsp->ls_comp_seg_index[i]; 1528 1529 /* 1530 * The first byte in a compressed segment is a flag 1531 * that indicates whether this segment is compressed 1532 * at all. 1533 * 1534 * The variable 'useg' is used (instead of 1535 * uncompressed_seg) in this loop to keep a 1536 * reference to the uncompressed segment. 1537 * 1538 * N.B. If 'useg' is replaced with uncompressed_seg, 1539 * it leads to memory leaks and heap corruption in 1540 * corner cases where compressed segments lie 1541 * adjacent to uncompressed segments. 1542 */ 1543 if (*cmpbuf == UNCOMPRESSED) { 1544 useg = cmpbuf + SEGHDR; 1545 } else { 1546 if (uncompressed_seg == NULL) 1547 uncompressed_seg = 1548 kmem_alloc(lsp->ls_uncomp_seg_sz, 1549 KM_SLEEP); 1550 useg = uncompressed_seg; 1551 uncompressed_seg_index = i; 1552 1553 if (li->l_decompress((cmpbuf + SEGHDR), 1554 (cmpbytes - SEGHDR), uncompressed_seg, 1555 &seglen, li->l_level) != 0) { 1556 error = EIO; 1557 goto done; 1558 } 1559 } 1560 1561 /* 1562 * Determine how much uncompressed data we 1563 * have to copy and copy it 1564 */ 1565 xfersize = lsp->ls_uncomp_seg_sz - sblkoff; 1566 if (i == eblkno) 1567 xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff); 1568 1569 bcopy((useg + sblkoff), bufaddr, xfersize); 1570 1571 cmpbuf += cmpbytes; 1572 bufaddr += xfersize; 1573 bp->b_resid -= xfersize; 1574 sblkoff = 0; 1575 1576 if (bp->b_resid == 0) 1577 break; 1578 } /* decompress compressed blocks ends */ 1579 1580 /* 1581 * Skip to done if there is no uncompressed data to cache 1582 */ 1583 if (uncompressed_seg == NULL) 1584 goto done; 1585 1586 /* 1587 * Add the data for the last decompressed segment to 1588 * the cache. 1589 * 1590 * In case the uncompressed segment data was added to (and 1591 * is referenced by) the cache, make sure we don't free it 1592 * here. 
1593 */ 1594 mutex_enter(&lsp->ls_comp_cache_lock); 1595 if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index, 1596 uncompressed_seg)) != NULL) { 1597 uncompressed_seg = NULL; 1598 } 1599 mutex_exit(&lsp->ls_comp_cache_lock); 1600 1601 done: 1602 if (compressed_seg != NULL) { 1603 mutex_enter(&lsp->ls_comp_bufs_lock); 1604 lsp->ls_comp_bufs[j].inuse = 0; 1605 mutex_exit(&lsp->ls_comp_bufs_lock); 1606 } 1607 if (uncompressed_seg != NULL) 1608 kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz); 1609 } /* end of handling compressed files */ 1610 1611 if ((error == 0) && (syncflag != 0)) 1612 error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL); 1613 1614 errout: 1615 if (bufinited && lsp->ls_kstat) { 1616 size_t n_done = bp->b_bcount - bp->b_resid; 1617 kstat_io_t *kioptr; 1618 1619 mutex_enter(lsp->ls_kstat->ks_lock); 1620 kioptr = KSTAT_IO_PTR(lsp->ls_kstat); 1621 if (bp->b_flags & B_READ) { 1622 kioptr->nread += n_done; 1623 kioptr->reads++; 1624 } else { 1625 kioptr->nwritten += n_done; 1626 kioptr->writes++; 1627 } 1628 kstat_runq_exit(kioptr); 1629 mutex_exit(lsp->ls_kstat->ks_lock); 1630 } 1631 1632 mutex_enter(&lsp->ls_vp_lock); 1633 if (--lsp->ls_vp_iocount == 0) 1634 cv_broadcast(&lsp->ls_vp_cv); 1635 mutex_exit(&lsp->ls_vp_lock); 1636 1637 bioerror(bp, error); 1638 biodone(bp); 1639 } 1640 1641 static int 1642 lofi_strategy(struct buf *bp) 1643 { 1644 struct lofi_state *lsp; 1645 offset_t offset; 1646 minor_t part; 1647 diskaddr_t p_lba; 1648 diskaddr_t p_nblks; 1649 int shift; 1650 1651 /* 1652 * We cannot just do I/O here, because the current thread 1653 * _might_ end up back in here because the underlying filesystem 1654 * wants a buffer, which eventually gets into bio_recycle and 1655 * might call into lofi to write out a delayed-write buffer. 1656 * This is bad if the filesystem above lofi is the same as below. 1657 * 1658 * We could come up with a complex strategy using threads to 1659 * do the I/O asynchronously, or we could use task queues. 
task 1660 * queues were incredibly easy so they win. 1661 */ 1662 1663 lsp = ddi_get_soft_state(lofi_statep, 1664 LOFI_MINOR2ID(getminor(bp->b_edev))); 1665 part = LOFI_PART(getminor(bp->b_edev)); 1666 1667 if (lsp == NULL) { 1668 bioerror(bp, ENXIO); 1669 biodone(bp); 1670 return (0); 1671 } 1672 1673 /* Check if we are closing. */ 1674 mutex_enter(&lsp->ls_vp_lock); 1675 if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { 1676 mutex_exit(&lsp->ls_vp_lock); 1677 bioerror(bp, EIO); 1678 biodone(bp); 1679 return (0); 1680 } 1681 mutex_exit(&lsp->ls_vp_lock); 1682 1683 shift = lsp->ls_lbshift; 1684 p_lba = 0; 1685 p_nblks = lsp->ls_vp_size >> shift; 1686 1687 if (lsp->ls_cmlbhandle != NULL) { 1688 if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba, 1689 NULL, NULL, 0)) { 1690 bioerror(bp, ENXIO); 1691 biodone(bp); 1692 return (0); 1693 } 1694 } 1695 1696 /* start block past partition end? */ 1697 if (bp->b_lblkno > p_nblks) { 1698 bioerror(bp, ENXIO); 1699 biodone(bp); 1700 return (0); 1701 } 1702 1703 offset = (bp->b_lblkno+p_lba) << shift; /* offset within file */ 1704 1705 mutex_enter(&lsp->ls_vp_lock); 1706 if (lsp->ls_crypto_enabled) { 1707 /* encrypted data really begins after crypto header */ 1708 offset += lsp->ls_crypto_offset; 1709 } 1710 1711 /* make sure we will not pass the file or partition size */ 1712 if (offset == lsp->ls_vp_size || 1713 offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) { 1714 /* EOF */ 1715 if ((bp->b_flags & B_READ) != 0) { 1716 bp->b_resid = bp->b_bcount; 1717 bioerror(bp, 0); 1718 } else { 1719 /* writes should fail */ 1720 bioerror(bp, ENXIO); 1721 } 1722 biodone(bp); 1723 mutex_exit(&lsp->ls_vp_lock); 1724 return (0); 1725 } 1726 if ((offset > lsp->ls_vp_size) || 1727 (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) || 1728 ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) { 1729 bioerror(bp, ENXIO); 1730 biodone(bp); 1731 mutex_exit(&lsp->ls_vp_lock); 1732 return (0); 1733 } 1734 
1735 mutex_exit(&lsp->ls_vp_lock); 1736 1737 if (lsp->ls_kstat) { 1738 mutex_enter(lsp->ls_kstat->ks_lock); 1739 kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat)); 1740 mutex_exit(lsp->ls_kstat->ks_lock); 1741 } 1742 bp->b_private = (void *)(uintptr_t)p_lba; /* partition start */ 1743 (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP); 1744 return (0); 1745 } 1746 1747 static int 1748 lofi_read(dev_t dev, struct uio *uio, struct cred *credp) 1749 { 1750 _NOTE(ARGUNUSED(credp)); 1751 1752 if (getminor(dev) == 0) 1753 return (EINVAL); 1754 UIO_CHECK(uio); 1755 return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio)); 1756 } 1757 1758 static int 1759 lofi_write(dev_t dev, struct uio *uio, struct cred *credp) 1760 { 1761 _NOTE(ARGUNUSED(credp)); 1762 1763 if (getminor(dev) == 0) 1764 return (EINVAL); 1765 UIO_CHECK(uio); 1766 return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio)); 1767 } 1768 1769 static int 1770 lofi_urw(struct lofi_state *lsp, uint16_t fmode, diskaddr_t off, size_t size, 1771 intptr_t arg, int flag, cred_t *credp) 1772 { 1773 struct uio uio; 1774 iovec_t iov; 1775 1776 /* 1777 * 1024 * 1024 apes cmlb_tg_max_efi_xfer as a reasonable max. 1778 */ 1779 if (size == 0 || size > 1024 * 1024 || 1780 (size % (1 << lsp->ls_lbshift)) != 0) 1781 return (EINVAL); 1782 1783 iov.iov_base = (void *)arg; 1784 iov.iov_len = size; 1785 uio.uio_iov = &iov; 1786 uio.uio_iovcnt = 1; 1787 uio.uio_loffset = off; 1788 uio.uio_segflg = (flag & FKIOCTL) ? UIO_SYSSPACE : UIO_USERSPACE; 1789 uio.uio_llimit = MAXOFFSET_T; 1790 uio.uio_resid = size; 1791 uio.uio_fmode = fmode; 1792 uio.uio_extflg = 0; 1793 1794 return (fmode == FREAD ? 
1795 lofi_read(lsp->ls_dev, &uio, credp) : 1796 lofi_write(lsp->ls_dev, &uio, credp)); 1797 } 1798 1799 /*ARGSUSED2*/ 1800 static int 1801 lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp) 1802 { 1803 if (getminor(dev) == 0) 1804 return (EINVAL); 1805 UIO_CHECK(aio->aio_uio); 1806 return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio)); 1807 } 1808 1809 /*ARGSUSED2*/ 1810 static int 1811 lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp) 1812 { 1813 if (getminor(dev) == 0) 1814 return (EINVAL); 1815 UIO_CHECK(aio->aio_uio); 1816 return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio)); 1817 } 1818 1819 /*ARGSUSED*/ 1820 static int 1821 lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 1822 { 1823 struct lofi_state *lsp; 1824 dev_t dev = (dev_t)arg; 1825 int instance; 1826 1827 instance = LOFI_MINOR2ID(getminor(dev)); 1828 switch (infocmd) { 1829 case DDI_INFO_DEVT2DEVINFO: 1830 lsp = ddi_get_soft_state(lofi_statep, instance); 1831 if (lsp == NULL) 1832 return (DDI_FAILURE); 1833 *result = lsp->ls_dip; 1834 return (DDI_SUCCESS); 1835 case DDI_INFO_DEVT2INSTANCE: 1836 *result = (void *) (intptr_t)instance; 1837 return (DDI_SUCCESS); 1838 } 1839 return (DDI_FAILURE); 1840 } 1841 1842 static int 1843 lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled) 1844 { 1845 int error = 0; 1846 int instance = ddi_get_instance(lsp->ls_dip); 1847 1848 if (labeled == B_TRUE) { 1849 cmlb_alloc_handle(&lsp->ls_cmlbhandle); 1850 error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT, 1851 B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN, 1852 CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1); 1853 1854 if (error != DDI_SUCCESS) { 1855 cmlb_free_handle(&lsp->ls_cmlbhandle); 1856 lsp->ls_cmlbhandle = NULL; 1857 error = ENXIO; 1858 } 1859 } else { 1860 /* create minor nodes */ 1861 error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE, 1862 S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0); 
1863 if (error == DDI_SUCCESS) { 1864 error = ddi_create_minor_node(lsp->ls_dip, 1865 LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance), 1866 DDI_PSEUDO, 0); 1867 if (error != DDI_SUCCESS) { 1868 ddi_remove_minor_node(lsp->ls_dip, 1869 LOFI_BLOCK_NODE); 1870 error = ENXIO; 1871 } 1872 } else 1873 error = ENXIO; 1874 } 1875 return (error); 1876 } 1877 1878 static int 1879 lofi_zone_bind(struct lofi_state *lsp) 1880 { 1881 int error = 0; 1882 1883 mutex_enter(&curproc->p_lock); 1884 if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) { 1885 mutex_exit(&curproc->p_lock); 1886 return (error); 1887 } 1888 mutex_exit(&curproc->p_lock); 1889 1890 if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME, 1891 (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) { 1892 rctl_decr_lofi(curproc->p_zone, 1); 1893 error = EINVAL; 1894 } else { 1895 zone_init_ref(&lsp->ls_zone); 1896 zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI); 1897 } 1898 return (error); 1899 } 1900 1901 static void 1902 lofi_zone_unbind(struct lofi_state *lsp) 1903 { 1904 (void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME); 1905 rctl_decr_lofi(curproc->p_zone, 1); 1906 zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI); 1907 } 1908 1909 static int 1910 lofi_online_dev(dev_info_t *dip) 1911 { 1912 boolean_t labeled; 1913 int error; 1914 int instance = ddi_get_instance(dip); 1915 struct lofi_state *lsp; 1916 1917 labeled = B_FALSE; 1918 if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled")) 1919 labeled = B_TRUE; 1920 1921 /* lsp alloc+init, soft state is freed in lofi_detach */ 1922 error = ddi_soft_state_zalloc(lofi_statep, instance); 1923 if (error == DDI_FAILURE) { 1924 return (ENOMEM); 1925 } 1926 1927 lsp = ddi_get_soft_state(lofi_statep, instance); 1928 lsp->ls_dip = dip; 1929 1930 if ((error = lofi_zone_bind(lsp)) != 0) 1931 goto err; 1932 1933 cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL); 1934 mutex_init(&lsp->ls_comp_cache_lock, NULL, 
MUTEX_DRIVER, NULL); 1935 mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL); 1936 mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL); 1937 mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL); 1938 1939 if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) { 1940 lofi_zone_unbind(lsp); 1941 goto lerr; 1942 } 1943 1944 /* driver handles kernel-issued IOCTLs */ 1945 if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, 1946 DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { 1947 error = DDI_FAILURE; 1948 goto merr; 1949 } 1950 1951 lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance, 1952 NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid()); 1953 if (lsp->ls_kstat == NULL) { 1954 (void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, 1955 DDI_KERNEL_IOCTL); 1956 error = ENOMEM; 1957 goto merr; 1958 } 1959 1960 lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock; 1961 kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID); 1962 kstat_install(lsp->ls_kstat); 1963 return (DDI_SUCCESS); 1964 merr: 1965 if (lsp->ls_cmlbhandle != NULL) { 1966 cmlb_detach(lsp->ls_cmlbhandle, 0); 1967 cmlb_free_handle(&lsp->ls_cmlbhandle); 1968 } 1969 ddi_remove_minor_node(dip, NULL); 1970 lofi_zone_unbind(lsp); 1971 lerr: 1972 mutex_destroy(&lsp->ls_comp_cache_lock); 1973 mutex_destroy(&lsp->ls_comp_bufs_lock); 1974 mutex_destroy(&lsp->ls_kstat_lock); 1975 mutex_destroy(&lsp->ls_vp_lock); 1976 cv_destroy(&lsp->ls_vp_cv); 1977 err: 1978 ddi_soft_state_free(lofi_statep, instance); 1979 return (error); 1980 } 1981 1982 static int 1983 lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 1984 { 1985 int rv; 1986 int instance = ddi_get_instance(dip); 1987 struct lofi_state *lsp; 1988 1989 if (cmd != DDI_ATTACH) 1990 return (DDI_FAILURE); 1991 1992 /* 1993 * Instance 0 is control instance, attaching control instance 1994 * will set the lofi up and ready. 
1995 */ 1996 if (instance == 0) { 1997 rv = ddi_soft_state_zalloc(lofi_statep, 0); 1998 if (rv == DDI_FAILURE) { 1999 return (DDI_FAILURE); 2000 } 2001 lsp = ddi_get_soft_state(lofi_statep, instance); 2002 rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0, 2003 DDI_PSEUDO, 0); 2004 if (rv == DDI_FAILURE) { 2005 ddi_soft_state_free(lofi_statep, 0); 2006 return (DDI_FAILURE); 2007 } 2008 /* driver handles kernel-issued IOCTLs */ 2009 if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, 2010 DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { 2011 ddi_remove_minor_node(dip, NULL); 2012 ddi_soft_state_free(lofi_statep, 0); 2013 return (DDI_FAILURE); 2014 } 2015 2016 zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL); 2017 2018 lsp->ls_dip = dip; 2019 } else { 2020 if (lofi_online_dev(dip) == DDI_FAILURE) 2021 return (DDI_FAILURE); 2022 } 2023 2024 ddi_report_dev(dip); 2025 return (DDI_SUCCESS); 2026 } 2027 2028 static int 2029 lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2030 { 2031 struct lofi_state *lsp; 2032 int instance = ddi_get_instance(dip); 2033 2034 if (cmd != DDI_DETACH) 2035 return (DDI_FAILURE); 2036 2037 /* 2038 * If the instance is not 0, release state. 2039 * The instance 0 is control device, we can not detach it 2040 * before other instances are detached. 
2041 */ 2042 if (instance != 0) { 2043 lsp = ddi_get_soft_state(lofi_statep, instance); 2044 if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) { 2045 ddi_soft_state_free(lofi_statep, instance); 2046 return (DDI_SUCCESS); 2047 } else 2048 return (DDI_FAILURE); 2049 } 2050 mutex_enter(&lofi_lock); 2051 2052 if (!list_is_empty(&lofi_list)) { 2053 mutex_exit(&lofi_lock); 2054 return (DDI_FAILURE); 2055 } 2056 2057 ddi_remove_minor_node(dip, NULL); 2058 ddi_prop_remove_all(dip); 2059 2060 mutex_exit(&lofi_lock); 2061 2062 if (zone_key_delete(lofi_zone_key) != 0) 2063 cmn_err(CE_WARN, "failed to delete zone key"); 2064 2065 ddi_soft_state_free(lofi_statep, 0); 2066 2067 return (DDI_SUCCESS); 2068 } 2069 2070 /* 2071 * With the addition of encryption, we must be careful that encryption key is 2072 * wiped before kernel's data structures are freed so it cannot accidentally 2073 * slip out to userland through uninitialized data elsewhere. 2074 */ 2075 static void 2076 free_lofi_ioctl(struct lofi_ioctl *klip) 2077 { 2078 /* Make sure this encryption key doesn't stick around */ 2079 bzero(klip->li_key, sizeof (klip->li_key)); 2080 kmem_free(klip, sizeof (struct lofi_ioctl)); 2081 } 2082 2083 /* 2084 * These two functions simplify the rest of the ioctls that need to copyin/out 2085 * the lofi_ioctl structure. 
 */
/*
 * Copy a lofi_ioctl from userland (or the kernel when FKIOCTL is set in
 * 'flag') into a freshly kmem_alloc'ed buffer returned via *klipp.
 * String fields are forcibly NUL-terminated and li_id range-checked.
 * Returns 0 on success; on failure the buffer is freed (with the key
 * zeroed) and *klipp must not be used.  Caller frees via
 * free_lofi_ioctl() on the success path.
 */
int
copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
    int flag)
{
	struct lofi_ioctl *klip;
	int	error;

	klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
	if (error)
		goto err;

	/* ensure NULL termination */
	klip->li_filename[MAXPATHLEN-1] = '\0';
	klip->li_devpath[MAXPATHLEN-1] = '\0';
	klip->li_algorithm[MAXALGLEN-1] = '\0';
	klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
	klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';

	/* reject ids that cannot be encoded in a 32-bit minor */
	if (klip->li_id > L_MAXMIN32) {
		error = EINVAL;
		goto err;
	}

	return (0);

err:
	free_lofi_ioctl(klip);
	return (error);
}

/*
 * Copy a kernel lofi_ioctl back out to userland.  Returns 0 or EFAULT.
 */
int
copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
    int flag)
{
	int	error;

	/*
	 * NOTE: Do NOT copy the crypto_key_t "back" to userland.
	 * This ensures that an attacker can't trivially find the
	 * key for a mapping just by issuing the ioctl.
	 *
	 * It can still be found by poking around in kmem with mdb(1),
	 * but there is no point in making it easy when the info isn't
	 * of any use in this direction anyway.
	 *
	 * Either way we don't actually have the raw key stored in
	 * a form that we can get it anyway, since we just used it
	 * to create a ctx template and didn't keep "the original".
	 */
	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
	if (error)
		return (EFAULT);
	return (0);
}

/*
 * Zone access check: global-zone callers may touch any mapping; others
 * only mappings bound to their own zone.  Returns 0 or EPERM.
 */
static int
lofi_access(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone)
		return (0);
	return (EPERM);
}

/*
 * Find the lofi state for the given filename. We compare by vnode to
 * allow the global zone visibility into NGZ lofi nodes.
 */
static int
file_to_lofi_nocheck(char *filename, boolean_t readonly,
    struct lofi_state **lspp)
{
	struct lofi_state *lsp;
	vnode_t	*vp = NULL;
	int	err = 0;
	int	rdfiles = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp)) != 0)
		goto out;

	/* compare against the real vnode underneath e.g. lofs */
	if (vp->v_type == VREG) {
		vnode_t *realvp;
		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			VN_HOLD(realvp);
			VN_RELE(vp);
			vp = realvp;
		}
	}

	for (lsp = list_head(&lofi_list); lsp != NULL;
	    lsp = list_next(&lofi_list, lsp)) {
		if (lsp->ls_vp == vp) {
			if (lspp != NULL)
				*lspp = lsp;
			if (lsp->ls_readonly) {
				rdfiles++;
				/* Skip if '-r' is specified */
				if (readonly)
					continue;
			}
			goto out;
		}
	}

	err = ENOENT;

	/*
	 * If a filename is given as an argument for lofi_unmap, we shouldn't
	 * allow unmap if there are multiple read-only lofi devices associated
	 * with this file.
	 */
	if (lspp != NULL) {
		if (rdfiles == 1)
			err = 0;
		else if (rdfiles > 1)
			err = EBUSY;
	}

out:
	if (vp != NULL)
		VN_RELE(vp);
	return (err);
}

/*
 * Find the minor for the given filename, checking the zone can access
 * it.
 */
static int
file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
{
	int err = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
		return (err);

	if ((err = lofi_access(*lspp)) != 0)
		return (err);

	return (0);
}

/*
 * Fakes up a disk geometry based on the size of the file.
This is needed 2237 * to support newfs on traditional lofi device, but also will provide 2238 * geometry hint for cmlb. 2239 */ 2240 static void 2241 fake_disk_geometry(struct lofi_state *lsp) 2242 { 2243 u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset; 2244 2245 /* dk_geom - see dkio(7I) */ 2246 /* 2247 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs 2248 * of sectors), but that breaks programs like fdisk which want to 2249 * partition a disk by cylinder. With one cylinder, you can't create 2250 * an fdisk partition and put pcfs on it for testing (hard to pick 2251 * a number between one and one). 2252 * 2253 * The cheezy floppy test is an attempt to not have too few cylinders 2254 * for a small file, or so many on a big file that you waste space 2255 * for backup superblocks or cylinder group structures. 2256 */ 2257 bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg)); 2258 if (dsize < (2 * 1024 * 1024)) /* floppy? */ 2259 lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024); 2260 else 2261 lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024); 2262 /* in case file file is < 100k */ 2263 if (lsp->ls_dkg.dkg_ncyl == 0) 2264 lsp->ls_dkg.dkg_ncyl = 1; 2265 2266 lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl; 2267 lsp->ls_dkg.dkg_nhead = 1; 2268 lsp->ls_dkg.dkg_rpm = 7200; 2269 2270 lsp->ls_dkg.dkg_nsect = dsize / 2271 (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift); 2272 } 2273 2274 /* 2275 * build vtoc - see dkio(7I) 2276 * 2277 * Fakes one big partition based on the size of the file. This is needed 2278 * because we allow newfs'ing the traditional lofi device and newfs will 2279 * do several disk ioctls to figure out the geometry and partition information. 2280 * It uses that information to determine the parameters to pass to mkfs. 
2281 */ 2282 static void 2283 fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt) 2284 { 2285 bzero(vt, sizeof (struct vtoc)); 2286 vt->v_sanity = VTOC_SANE; 2287 vt->v_version = V_VERSION; 2288 (void) strncpy(vt->v_volume, LOFI_DRIVER_NAME, 2289 sizeof (vt->v_volume)); 2290 vt->v_sectorsz = 1 << lsp->ls_pbshift; 2291 vt->v_nparts = 1; 2292 vt->v_part[0].p_tag = V_UNASSIGNED; 2293 2294 /* 2295 * A compressed file is read-only, other files can 2296 * be read-write 2297 */ 2298 if (lsp->ls_uncomp_seg_sz > 0) { 2299 vt->v_part[0].p_flag = V_UNMNT | V_RONLY; 2300 } else { 2301 vt->v_part[0].p_flag = V_UNMNT; 2302 } 2303 vt->v_part[0].p_start = (daddr_t)0; 2304 /* 2305 * The partition size cannot just be the number of sectors, because 2306 * that might not end on a cylinder boundary. And if that's the case, 2307 * newfs/mkfs will print a scary warning. So just figure the size 2308 * based on the number of cylinders and sectors/cylinder. 2309 */ 2310 vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl * 2311 lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead; 2312 } 2313 2314 /* 2315 * build dk_cinfo - see dkio(7I) 2316 */ 2317 static void 2318 fake_disk_info(dev_t dev, struct dk_cinfo *ci) 2319 { 2320 bzero(ci, sizeof (struct dk_cinfo)); 2321 (void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname)); 2322 ci->dki_ctype = DKC_SCSI_CCS; 2323 (void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname)); 2324 ci->dki_unit = LOFI_MINOR2ID(getminor(dev)); 2325 ci->dki_partition = LOFI_PART(getminor(dev)); 2326 /* 2327 * newfs uses this to set maxcontig. Must not be < 16, or it 2328 * will be 0 when newfs multiplies it by DEV_BSIZE and divides 2329 * it by the block size. Then tunefs doesn't work because 2330 * maxcontig is 0. 2331 */ 2332 ci->dki_maxtransfer = 16; 2333 } 2334 2335 /* 2336 * map in a compressed file 2337 * 2338 * Read in the header and the index that follows. 
 *
 * The header is as follows -
 *
 * Signature (name of the compression algorithm)
 * Compression segment size (a multiple of 512)
 * Number of index entries
 * Size of the last block
 * The array containing the index entries
 *
 * The header information is always stored in
 * network byte order on disk.
 */
static int
lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
{
	uint32_t index_sz, header_len, i;
	ssize_t resid;
	enum uio_rw rw;
	char *tbuf = buf;
	int error;

	/* The signature has already been read */
	tbuf += sizeof (lsp->ls_comp_algorithm);
	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);

	/*
	 * The compressed segment size must be a power of 2
	 */
	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
	    !ISP2(lsp->ls_uncomp_seg_sz))
		return (EINVAL);

	/* compute log2(ls_uncomp_seg_sz): position of the (only) set bit */
	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
		;

	lsp->ls_comp_seg_shift = i;

	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);

	tbuf += sizeof (lsp->ls_comp_index_sz);
	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
	    sizeof (lsp->ls_uncomp_last_seg_sz));
	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);

	/*
	 * Compute the total size of the uncompressed data
	 * for use in fake_disk_geometry and other calculations.
	 * Disk geometry has to be faked with respect to the
	 * actual uncompressed data size rather than the
	 * compressed file size.
	 */
	lsp->ls_vp_size =
	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
	    + lsp->ls_uncomp_last_seg_sz;

	/*
	 * Index size is rounded up to DEV_BSIZE for ease
	 * of segmapping
	 *
	 * NOTE(review): index_sz is 32-bit and ls_comp_index_sz comes
	 * straight from the on-disk header; confirm that callers bound
	 * the image size so this multiply cannot wrap.
	 */
	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
	header_len = sizeof (lsp->ls_comp_algorithm) +
	    sizeof (lsp->ls_uncomp_seg_sz) +
	    sizeof (lsp->ls_comp_index_sz) +
	    sizeof (lsp->ls_uncomp_last_seg_sz);
	lsp->ls_comp_offbase = header_len + index_sz;

	index_sz += header_len;
	index_sz = roundup(index_sz, DEV_BSIZE);

	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
	lsp->ls_comp_index_data_sz = index_sz;

	/*
	 * Read in the index -- this has a side-effect
	 * of reading in the header as well
	 */
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	/* Skip the header, this is where the index really begins */
	lsp->ls_comp_seg_index =
	    /*LINTED*/
	    (uint64_t *)(lsp->ls_comp_index_data + header_len);

	/*
	 * Now recompute offsets in the index to account for
	 * the header length
	 */
	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
		    BE_64(lsp->ls_comp_seg_index[i]);
	}

	return (error);
}

/*
 * Set up encryption state for a mapping from the li_* crypto fields:
 * validate key/IV lengths, resolve mechanisms, check the key, then read
 * (or, for a fresh image, write) the on-disk crypto header at CRYOFF.
 * No-op when li_crypto_enabled is not set.
 */
static int
lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip)
{
	struct crypto_meta chead;
	char buf[DEV_BSIZE];
	ssize_t resid;
	char *marker;
	int error;
	int ret;
	int i;

	if (!klip->li_crypto_enabled)
		return (0);

	/*
	 * All current algorithms have a max of 448 bits, but the check
	 * below deliberately allows up to 512 bits of IV.
	 */
	if (klip->li_iv_len > CRYPTO_BITS2BYTES(512))
		return (EINVAL);

	/* the raw key must fit in the fixed-size li_key field */
	if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key))
		return (EINVAL);

	lsp->ls_crypto_enabled = klip->li_crypto_enabled;

	mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);

	lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
	if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid cipher %s requested for %s",
		    klip->li_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* this is just initialization here */
	lsp->ls_mech.cm_param = NULL;
	lsp->ls_mech.cm_param_len = 0;

	lsp->ls_iv_type = klip->li_iv_type;
	lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
	if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid iv cipher %s requested"
		    " for %s", klip->li_iv_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* iv mech must itself take a null iv */
	lsp->ls_iv_mech.cm_param = NULL;
	lsp->ls_iv_mech.cm_param_len = 0;
	lsp->ls_iv_len = klip->li_iv_len;

	/*
	 * Create ctx using li_cipher & the raw li_key after checking
	 * that it isn't a weak key.
	 *
	 * Note: ck_data is freed on the caller's error path (lofi_destroy),
	 * so the early returns below do not leak it.
	 */
	lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
	lsp->ls_key.ck_length = klip->li_key_len;
	lsp->ls_key.ck_data = kmem_alloc(
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
	bcopy(klip->li_key, lsp->ls_key.ck_data,
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));

	ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "weak key check failed for cipher "
		    "%s on file %s (0x%x)", klip->li_cipher,
		    klip->li_filename, ret);
		return (EINVAL);
	}

	/* read the first sector of the crypto header area */
	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/*
	 * This is the case where the header in the lofi image is already
	 * initialized to indicate it is encrypted.
	 */
	if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) {
		/*
		 * The encryption header information is laid out this way:
		 *	6 bytes:	hex "CFLOFI"
		 *	2 bytes:	version = 0 ... for now
		 *	96 bytes:	reserved1 (not implemented yet)
		 *	4 bytes:	data_sector = 2 ... for now
		 *	more...		not implemented yet
		 */

		marker = buf;

		/* copy the magic */
		bcopy(marker, lsp->ls_crypto.magic,
		    sizeof (lsp->ls_crypto.magic));
		marker += sizeof (lsp->ls_crypto.magic);

		/* read the encryption version number */
		bcopy(marker, &(lsp->ls_crypto.version),
		    sizeof (lsp->ls_crypto.version));
		lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
		marker += sizeof (lsp->ls_crypto.version);

		/* read a chunk of reserved data */
		bcopy(marker, lsp->ls_crypto.reserved1,
		    sizeof (lsp->ls_crypto.reserved1));
		marker += sizeof (lsp->ls_crypto.reserved1);

		/* read block number where encrypted data begins */
		bcopy(marker, &(lsp->ls_crypto.data_sector),
		    sizeof (lsp->ls_crypto.data_sector));
		lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
		marker += sizeof (lsp->ls_crypto.data_sector);

		/* and ignore the rest until it is implemented */

		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
		return (0);
	}

	/*
	 * We've requested encryption, but no magic was found, so it must be
	 * a new image.
	 */

	/* a new image must have an all-zero header area */
	for (i = 0; i < sizeof (struct crypto_meta); i++) {
		if (buf[i] != '\0')
			return (EINVAL);
	}

	/* build the on-disk header in network byte order */
	marker = buf;
	bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
	marker += sizeof (lofi_crypto_magic);
	chead.version = htons(LOFI_CRYPTO_VERSION);
	bcopy(&(chead.version), marker, sizeof (chead.version));
	marker += sizeof (chead.version);
	marker += sizeof (chead.reserved1);
	chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
	bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));

	/* write the header */
	error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/* fix things up so it looks like we read this info */
	bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
	    sizeof (lofi_crypto_magic));
	lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
	lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
	lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
	return (0);
}

/*
 * Check to see if the passed in signature is a valid one.  If it is
 * valid, return the index into lofi_compress_table.
 *
 * Return -1 if it is invalid
 */
static int
lofi_compress_select(const char *signature)
{
	int i;

	for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
		if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
			return (i);
	}

	return (-1);
}

/*
 * Detect a compressed image by reading its first sector and matching
 * the signature against lofi_compress_table.  A plain (uncompressed)
 * image returns 0 without touching the compression state.
 */
static int
lofi_init_compress(struct lofi_state *lsp)
{
	char buf[DEV_BSIZE];
	int compress_index;
	ssize_t resid;
	int error;

	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	/* no known signature: not a compressed image, nothing to do */
	if ((compress_index = lofi_compress_select(buf)) == -1)
		return (0);

	/* compression and encryption are mutually exclusive */
	if (lsp->ls_crypto_enabled)
		return (ENOTSUP);

	/* initialize compression info for compressed lofi */
	lsp->ls_comp_algorithm_index = compress_index;
	(void) strlcpy(lsp->ls_comp_algorithm,
	    lofi_compress_table[compress_index].l_name,
	    sizeof (lsp->ls_comp_algorithm));

	/* Finally setup per-thread pre-allocated buffers */
	lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads *
	    sizeof (struct compbuf), KM_SLEEP);

	return (lofi_map_compressed_file(lsp, buf));
}

/*
 * Allocate new or proposed id from lofi_id.
 *
 * Special cases for proposed id:
 * 0: not allowed, 0 is id for control device.
 * -1: allocate first usable id from lofi_id.
 * any other value is proposed value from userland
 *
 * returns DDI_SUCCESS or errno.
 */
static int
lofi_alloc_id(int *idp)
{
	int id, error = DDI_SUCCESS;

	if (*idp == -1) {
		id = id_allocff_nosleep(lofi_id);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	} else if (*idp == 0) {
		error = EINVAL;
		goto err;
	} else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) {
		/* proposed id would not fit in the instance part */
		error = ERANGE;
		goto err;
	} else {
		if (ddi_get_soft_state(lofi_statep, *idp) != NULL) {
			error = EEXIST;
			goto err;
		}

		id = id_alloc_specific_nosleep(lofi_id, *idp);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	}
	*idp = id;
err:
	return (error);
}

/*
 * Create and online the devinfo child node for a new mapping.  On
 * success klip->li_id holds the allocated instance id; on failure the
 * id is released and an errno is returned.
 */
static int
lofi_create_dev(struct lofi_ioctl *klip)
{
	dev_info_t *parent, *child;
	struct lofi_state *lsp = NULL;
	char namebuf[MAXNAMELEN];
	int error, circ;

	/* get control device */
	lsp = ddi_get_soft_state(lofi_statep, 0);
	parent = ddi_get_parent(lsp->ls_dip);

	if ((error = lofi_alloc_id((int *)&klip->li_id)))
		return (error);

	(void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d",
	    klip->li_id);

	ndi_devi_enter(parent, &circ);
	child = ndi_devi_findchild(parent, namebuf);
	ndi_devi_exit(parent, circ);

	if (child == NULL) {
		child = ddi_add_child(parent, LOFI_DRIVER_NAME,
		    (pnode_t)DEVI_SID_NODEID, klip->li_id);
		if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child,
		    "instance", klip->li_id)) != DDI_PROP_SUCCESS)
			goto err;

		/* "labeled" property marks a cmlb-backed virtual disk */
		if (klip->li_labeled == B_TRUE) {
			if ((error = ddi_prop_create(DDI_DEV_T_NONE, child,
			    DDI_PROP_CANSLEEP, "labeled", 0, 0))
			    != DDI_PROP_SUCCESS)
				goto err;
		}

		if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH))
		    != NDI_SUCCESS)
			goto err;
	} else {
		id_free(lofi_id, klip->li_id);
		error = EEXIST;
		return (error);
	}

	goto done;

err:
	/* tear down the partially-created child node and release the id */
	ddi_prop_remove_all(child);
	(void) ndi_devi_offline(child, NDI_DEVI_REMOVE);
	id_free(lofi_id, klip->li_id);
done:

	return (error);
}

/*
 * Fill in a fake SCSI INQUIRY response: fixed vendor/revision, product
 * id taken from the last component of the backing file's v_path.
 */
static void
lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq)
{
	char *p = NULL;

	(void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid));

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp != NULL)
		p = strrchr(lsp->ls_vp->v_path, '/');
	if (p != NULL)
		(void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid));
	mutex_exit(&lsp->ls_vp_lock);
	(void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision));
}

/*
 * copy devlink name from event cache
 */
static void
lofi_copy_devpath(struct lofi_ioctl *klip)
{
	int error;
	char namebuf[MAXNAMELEN], *str;
	clock_t ticks;
	nvlist_t *nvl = NULL;

	if (klip->li_labeled == B_TRUE)
		klip->li_devpath[0] = '\0';
	else {
		/* no need to wait for messages */
		(void) snprintf(klip->li_devpath, sizeof (klip->li_devpath),
		    "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id);
		return;
	}

	(void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id);

	/*
	 * Labeled device: the /dev/dsk name is produced asynchronously by
	 * devfs; poll the event cache under ln_lock, waiting up to
	 * lofi_timeout seconds per attempt for it to appear.
	 */
	mutex_enter(&lofi_devlink_cache.ln_lock);
	do {
		error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    namebuf, &nvl);

		if (error != 0) {
			/* No data in cache, wait for some. */
			ticks = ddi_get_lbolt() +
			    lofi_timeout * drv_usectohz(1000000);
			error = cv_timedwait(&lofi_devlink_cache.ln_cv,
			    &lofi_devlink_cache.ln_lock, ticks);
			if (error == -1)
				break;	/* timeout */
			error = 1;
			continue;	/* Read again. */
		}

		if (nvl != NULL) {
			if (nvlist_lookup_string(nvl, DEV_NAME, &str) == 0) {
				/* skip raw /dev/rlofi names; wait for dsk */
				if (strncmp(str, "/dev/" LOFI_CHAR_NAME,
				    sizeof ("/dev/" LOFI_CHAR_NAME) - 1) == 0) {
					error = 1;
					continue;
				}
				(void) strlcpy(klip->li_devpath, str,
				    sizeof (klip->li_devpath));
			}
		}
	} while (error != 0);
	mutex_exit(&lofi_devlink_cache.ln_lock);
}

/*
 * map a file to a minor number. Return the minor number.
 */
static int
lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
    int *rvalp, struct cred *credp, int ioctl_flag)
{
	int id = -1;
	struct lofi_state *lsp = NULL;
	struct lofi_ioctl *klip;
	int error;
	struct vnode *vp = NULL;
	vattr_t vattr;
	int flag;
	char namebuf[MAXNAMELEN];

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	mutex_enter(&lofi_lock);

	/* refuse to map the same file twice */
	if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly,
	    NULL) == 0) {
		error = EBUSY;
		goto err;
	}

	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
	if (error) {
		/* try read-only */
		flag &= ~FWRITE;
		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
		    &vp, 0, 0);
		if (error)
			goto err;
	}

	if (!V_ISLOFIABLE(vp->v_type)) {
		error = EINVAL;
		goto err;
	}

	vattr.va_mask = AT_SIZE;
	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
	if (error)
		goto err;

	/* the file needs to be a multiple of the block size */
	if ((vattr.va_size % DEV_BSIZE) != 0) {
		error = EINVAL;
		goto err;
	}

	if (pickminor) {
		klip->li_id = (uint32_t)-1;
	}
	if ((error = lofi_create_dev(klip)) != 0)
		goto err;

	id = klip->li_id;
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL)
2883 goto err; 2884 2885 /* 2886 * from this point lofi_destroy() is used to clean up on error 2887 * make sure the basic data is set 2888 */ 2889 list_insert_tail(&lofi_list, lsp); 2890 lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id)); 2891 2892 list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache), 2893 offsetof(struct lofi_comp_cache, lc_list)); 2894 2895 /* 2896 * save open mode so file can be closed properly and vnode counts 2897 * updated correctly. 2898 */ 2899 lsp->ls_openflag = flag; 2900 2901 lsp->ls_vp = vp; 2902 lsp->ls_stacked_vp = vp; 2903 2904 lsp->ls_vp_size = vattr.va_size; 2905 lsp->ls_vp_comp_size = lsp->ls_vp_size; 2906 2907 /* 2908 * Try to handle stacked lofs vnodes. 2909 */ 2910 if (vp->v_type == VREG) { 2911 vnode_t *realvp; 2912 2913 if (VOP_REALVP(vp, &realvp, NULL) == 0) { 2914 /* 2915 * We need to use the realvp for uniqueness 2916 * checking, but keep the stacked vp for 2917 * LOFI_GET_FILENAME display. 2918 */ 2919 VN_HOLD(realvp); 2920 lsp->ls_vp = realvp; 2921 } 2922 } 2923 2924 lsp->ls_lbshift = highbit(DEV_BSIZE) - 1; 2925 lsp->ls_pbshift = lsp->ls_lbshift; 2926 2927 lsp->ls_readonly = klip->li_readonly; 2928 lsp->ls_uncomp_seg_sz = 0; 2929 lsp->ls_comp_algorithm[0] = '\0'; 2930 lsp->ls_crypto_offset = 0; 2931 2932 (void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d", 2933 LOFI_DRIVER_NAME, id); 2934 lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads, 2935 minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0); 2936 2937 if ((error = lofi_init_crypto(lsp, klip)) != 0) 2938 goto err; 2939 2940 if ((error = lofi_init_compress(lsp)) != 0) 2941 goto err; 2942 2943 fake_disk_geometry(lsp); 2944 2945 /* For unlabeled lofi add Nblocks and Size */ 2946 if (klip->li_labeled == B_FALSE) { 2947 error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip, 2948 SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset); 2949 if (error != DDI_PROP_SUCCESS) { 2950 error = EINVAL; 2951 goto err; 2952 } 2953 
error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip, 2954 NBLOCKS_PROP_NAME, 2955 (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE); 2956 if (error != DDI_PROP_SUCCESS) { 2957 error = EINVAL; 2958 goto err; 2959 } 2960 } 2961 2962 /* 2963 * Notify we are ready to rock. 2964 */ 2965 mutex_enter(&lsp->ls_vp_lock); 2966 lsp->ls_vp_ready = B_TRUE; 2967 cv_broadcast(&lsp->ls_vp_cv); 2968 mutex_exit(&lsp->ls_vp_lock); 2969 mutex_exit(&lofi_lock); 2970 2971 lofi_copy_devpath(klip); 2972 2973 if (rvalp) 2974 *rvalp = id; 2975 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 2976 free_lofi_ioctl(klip); 2977 return (0); 2978 2979 err: 2980 if (lsp != NULL) { 2981 lofi_destroy(lsp, credp); 2982 } else { 2983 if (vp != NULL) { 2984 (void) VOP_PUTPAGE(vp, 0, 0, B_FREE, credp, NULL); 2985 (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL); 2986 VN_RELE(vp); 2987 } 2988 } 2989 2990 mutex_exit(&lofi_lock); 2991 free_lofi_ioctl(klip); 2992 return (error); 2993 } 2994 2995 /* 2996 * unmap a file. 2997 */ 2998 static int 2999 lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename, 3000 struct cred *credp, int ioctl_flag) 3001 { 3002 struct lofi_state *lsp; 3003 struct lofi_ioctl *klip; 3004 char namebuf[MAXNAMELEN]; 3005 int err; 3006 3007 err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 3008 if (err != 0) 3009 return (err); 3010 3011 mutex_enter(&lofi_lock); 3012 if (byfilename) { 3013 if ((err = file_to_lofi(klip->li_filename, klip->li_readonly, 3014 &lsp)) != 0) { 3015 goto done; 3016 } 3017 } else if (klip->li_id == 0) { 3018 err = ENXIO; 3019 goto done; 3020 } else { 3021 lsp = ddi_get_soft_state(lofi_statep, klip->li_id); 3022 } 3023 3024 if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) { 3025 err = ENXIO; 3026 goto done; 3027 } 3028 3029 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3030 (void) snprintf(namebuf, sizeof (namebuf), "%u", klip->li_id); 3031 3032 /* 3033 * If it's still held open, we'll do one of three things: 3034 * 3035 * If no flag 
is set, just return EBUSY.
	 *
	 * If the 'cleanup' flag is set, unmap and remove the device when
	 * the last user finishes.
	 *
	 * If the 'force' flag is set, then we forcibly close the underlying
	 * file.  Subsequent operations will fail, and the DKIOCSTATE ioctl
	 * will return DKIO_DEV_GONE.  When the device is last closed, the
	 * device will be cleaned up appropriately.
	 *
	 * This is complicated by the fact that we may have outstanding
	 * dispatched I/Os.  Rather than having a single mutex to serialize all
	 * I/O, we keep a count of the number of outstanding I/O requests
	 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
	 * should be dispatched (ls_vp_closereq).
	 *
	 * We set the flag, wait for the number of outstanding I/Os to reach 0,
	 * and then close the underlying vnode.
	 */
	if (is_opened(lsp)) {
		if (klip->li_force) {
			/* Mark the device for cleanup. */
			lofi_set_cleanup(lsp);
			mutex_enter(&lsp->ls_vp_lock);
			lsp->ls_vp_closereq = B_TRUE;
			/* Wake up any threads waiting on dkiocstate. */
			cv_broadcast(&lsp->ls_vp_cv);
			/* drain all in-flight I/O before the vnode goes away */
			while (lsp->ls_vp_iocount > 0)
				cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
			mutex_exit(&lsp->ls_vp_lock);
		} else if (klip->li_cleanup) {
			lofi_set_cleanup(lsp);
		} else {
			err = EBUSY;
		}
	} else {
		/* not open: tear everything down right now */
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

	/* Remove name from devlink cache */
	mutex_enter(&lofi_devlink_cache.ln_lock);
	(void) nvlist_remove_all(lofi_devlink_cache.ln_data, namebuf);
	cv_broadcast(&lofi_devlink_cache.ln_cv);
	mutex_exit(&lofi_devlink_cache.ln_lock);
done:
	mutex_exit(&lofi_lock);
	if (err == 0)
		(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (err);
}

/*
 * get the filename given the minor number, or the minor number given
 * the name.
3091 */ 3092 /*ARGSUSED*/ 3093 static int 3094 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, 3095 struct cred *credp, int ioctl_flag) 3096 { 3097 struct lofi_ioctl *klip; 3098 struct lofi_state *lsp; 3099 int error; 3100 3101 error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 3102 if (error != 0) 3103 return (error); 3104 3105 switch (which) { 3106 case LOFI_GET_FILENAME: 3107 if (klip->li_id == 0) { 3108 free_lofi_ioctl(klip); 3109 return (EINVAL); 3110 } 3111 3112 mutex_enter(&lofi_lock); 3113 lsp = ddi_get_soft_state(lofi_statep, klip->li_id); 3114 if (lsp == NULL || lofi_access(lsp) != 0) { 3115 mutex_exit(&lofi_lock); 3116 free_lofi_ioctl(klip); 3117 return (ENXIO); 3118 } 3119 3120 /* 3121 * This may fail if, for example, we're trying to look 3122 * up a zoned NFS path from the global zone. 3123 */ 3124 if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename, 3125 sizeof (klip->li_filename), CRED()) != 0) { 3126 (void) strlcpy(klip->li_filename, "?", 3127 sizeof (klip->li_filename)); 3128 } 3129 3130 klip->li_readonly = lsp->ls_readonly; 3131 klip->li_labeled = lsp->ls_cmlbhandle != NULL; 3132 3133 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 3134 sizeof (klip->li_algorithm)); 3135 klip->li_crypto_enabled = lsp->ls_crypto_enabled; 3136 mutex_exit(&lofi_lock); 3137 3138 lofi_copy_devpath(klip); 3139 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3140 free_lofi_ioctl(klip); 3141 return (error); 3142 case LOFI_GET_MINOR: 3143 mutex_enter(&lofi_lock); 3144 error = file_to_lofi(klip->li_filename, 3145 klip->li_readonly, &lsp); 3146 if (error != 0) { 3147 mutex_exit(&lofi_lock); 3148 free_lofi_ioctl(klip); 3149 return (error); 3150 } 3151 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3152 3153 klip->li_readonly = lsp->ls_readonly; 3154 klip->li_labeled = lsp->ls_cmlbhandle != NULL; 3155 mutex_exit(&lofi_lock); 3156 3157 lofi_copy_devpath(klip); 3158 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3159 3160 
free_lofi_ioctl(klip); 3161 return (error); 3162 case LOFI_CHECK_COMPRESSED: 3163 mutex_enter(&lofi_lock); 3164 error = file_to_lofi(klip->li_filename, 3165 klip->li_readonly, &lsp); 3166 if (error != 0) { 3167 mutex_exit(&lofi_lock); 3168 free_lofi_ioctl(klip); 3169 return (error); 3170 } 3171 3172 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3173 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 3174 sizeof (klip->li_algorithm)); 3175 3176 mutex_exit(&lofi_lock); 3177 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3178 free_lofi_ioctl(klip); 3179 return (error); 3180 default: 3181 free_lofi_ioctl(klip); 3182 return (EINVAL); 3183 } 3184 } 3185 3186 static int 3187 uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb, 3188 struct uscsi_cmd *uscmd) 3189 { 3190 int rval; 3191 3192 #ifdef _MULTI_DATAMODEL 3193 switch (ddi_model_convert_from(flag & FMODELS)) { 3194 case DDI_MODEL_ILP32: { 3195 struct uscsi_cmd32 ucmd32; 3196 3197 if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) { 3198 rval = EFAULT; 3199 goto err; 3200 } 3201 uscsi_cmd32touscsi_cmd((&ucmd32), uscmd); 3202 break; 3203 } 3204 case DDI_MODEL_NONE: 3205 if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) { 3206 rval = EFAULT; 3207 goto err; 3208 } 3209 break; 3210 default: 3211 rval = EFAULT; 3212 goto err; 3213 } 3214 #else 3215 if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) { 3216 rval = EFAULT; 3217 goto err; 3218 } 3219 #endif /* _MULTI_DATAMODEL */ 3220 if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) { 3221 rval = EFAULT; 3222 goto err; 3223 } 3224 if (cdb->scc_cmd == SCMD_INQUIRY) { 3225 return (0); 3226 } 3227 err: 3228 return (rval); 3229 } 3230 3231 static int 3232 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, 3233 int *rvalp) 3234 { 3235 int error; 3236 enum dkio_state dkstate; 3237 struct lofi_state *lsp; 3238 dk_efi_t user_efi; 3239 int id; 3240 3241 id = LOFI_MINOR2ID(getminor(dev)); 3242 
	/* lofi ioctls only apply to the master device */
	if (id == 0) {
		struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;

		/*
		 * the query command only need read-access - i.e., normal
		 * users are allowed to do those on the ctl device as
		 * long as they can open it read-only.
		 */
		switch (cmd) {
		case LOFI_MAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
		case LOFI_MAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
		case LOFI_UNMAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 1, credp, flag));
		case LOFI_UNMAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 0, credp, flag));
		case LOFI_GET_FILENAME:
			return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
			    credp, flag));
		case LOFI_GET_MINOR:
			return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
			    credp, flag));

		/*
		 * This API made limited sense when this value was fixed
		 * at LOFI_MAX_FILES.  However, its use to iterate
		 * across all possible devices in lofiadm means we don't
		 * want to return L_MAXMIN, but the highest
		 * *allocated* id.
		 */
		case LOFI_GET_MAXMINOR:
			id = 0;

			mutex_enter(&lofi_lock);

			/*
			 * Walk every current mapping and remember the highest
			 * instance number among those this caller is allowed
			 * to see (lofi_access() filters the rest).
			 */
			for (lsp = list_head(&lofi_list); lsp != NULL;
			    lsp = list_next(&lofi_list, lsp)) {
				int i;
				if (lofi_access(lsp) != 0)
					continue;

				i = ddi_get_instance(lsp->ls_dip);
				if (i > id)
					id = i;
			}

			mutex_exit(&lofi_lock);

			/* Only li_id is written back, not the whole struct. */
			error = ddi_copyout(&id, &lip->li_id,
			    sizeof (id), flag);
			if (error)
				return (EFAULT);
			return (0);

		case LOFI_CHECK_COMPRESSED:
			return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
			    credp, flag));
		default:
			return (EINVAL);
		}
	}

	/*
	 * Not the control device: look up this instance's state.  Fail as if
	 * the device does not exist when it is absent or mid-teardown.
	 */
	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL || lsp->ls_cleanup) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}
	mutex_exit(&lofi_lock);

	/*
	 * Labeled (virtual disk) mappings route disk ioctls through cmlb
	 * first; we only fall through to the fake-geometry code below when
	 * cmlb does not implement the ioctl (ENOTTY).
	 */
	if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS,
	    "labeled") == 1) {
		error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag,
		    credp, rvalp, 0);
		if (error != ENOTTY)
			return (error);
	}

	/*
	 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
	 * EIO as if the device was no longer present.
	 */
	if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
		return (EIO);

	/* these are for faking out utilities like newfs */
	switch (cmd) {
	case DKIOCGMEDIAINFO:
	case DKIOCGMEDIAINFOEXT: {
		/*
		 * dk_minfo is a prefix of dk_minfo_ext, so one buffer serves
		 * both commands; only the copyout size differs, and only the
		 * extended form carries the physical block size.
		 */
		struct dk_minfo_ext media_info;
		int shift = lsp->ls_lbshift;
		int size;

		if (cmd == DKIOCGMEDIAINFOEXT) {
			media_info.dki_pbsize = 1U << lsp->ls_pbshift;
			size = sizeof (struct dk_minfo_ext);
		} else {
			size = sizeof (struct dk_minfo);
		}

		media_info.dki_media_type = DK_FIXED_DISK;
		media_info.dki_lbsize = 1U << shift;
		/* Capacity excludes the leading crypto metadata region. */
		media_info.dki_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift;

		if (ddi_copyout(&media_info, (void *)arg, size, flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCREMOVABLE: {
		/* lofi devices are never removable media. */
		int i = 0;
		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag))
			return (EFAULT);
		return (0);
	}

	case DKIOCGVTOC: {
		/* Return a faked VTOC, converted for 32-bit callers. */
		struct vtoc vt;
		fake_disk_vtoc(lsp, &vt);

		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct vtoc32 vtoc32;

			vtoctovtoc32(vt, vtoc32);
			if (ddi_copyout(&vtoc32, (void *)arg,
			    sizeof (struct vtoc32), flag))
				return (EFAULT);
			break;
		}

		case DDI_MODEL_NONE:
			if (ddi_copyout(&vt, (void *)arg,
			    sizeof (struct vtoc), flag))
				return (EFAULT);
			break;
		}
		return (0);
	}
	case DKIOCINFO: {
		struct dk_cinfo ci;
		fake_disk_info(dev, &ci);
		if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCG_VIRTGEOM:
	case DKIOCG_PHYGEOM:
	case DKIOCGGEOM:
		/* All three geometry flavors report the same faked geometry. */
		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
		    sizeof (struct dk_geom), flag);
		if (error)
			return (EFAULT);
		return (0);
	case DKIOCSTATE:
		/*
		 * Normally, lofi devices are always in the INSERTED state.  If
		 * a device is forcefully unmapped, then the device transitions
		 * to the DKIO_DEV_GONE state.
		 *
		 * Block (interruptibly) while the caller's last-seen state
		 * still matches the current state, i.e. until a transition
		 * occurs or teardown begins.
		 */
		if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
		    flag) != 0)
			return (EFAULT);

		mutex_enter(&lsp->ls_vp_lock);
		while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
		    (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
		    !lsp->ls_cleanup) {
			/*
			 * By virtue of having the device open, we know that
			 * 'lsp' will remain valid when we return.
			 */
			if (!cv_wait_sig(&lsp->ls_vp_cv, &lsp->ls_vp_lock)) {
				mutex_exit(&lsp->ls_vp_lock);
				return (EINTR);
			}
		}

		dkstate = (!lsp->ls_cleanup && lsp->ls_vp != NULL ?
		    DKIO_INSERTED : DKIO_DEV_GONE);
		mutex_exit(&lsp->ls_vp_lock);

		if (ddi_copyout(&dkstate, (void *)arg,
		    sizeof (dkstate), flag) != 0)
			return (EFAULT);
		return (0);
	case USCSICMD: {
		/*
		 * Fake a minimal SCSI personality: INQUIRY and READ CAPACITY
		 * are answered from lofi state; any other CDB falls through
		 * and is failed back to the caller.
		 */
		struct uscsi_cmd uscmd;
		union scsi_cdb cdb;

		if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) {
			struct scsi_inquiry inq = {0};

			lofi_create_inquiry(lsp, &inq);
			if (ddi_copyout(&inq, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		} else if (cdb.scc_cmd == SCMD_READ_CAPACITY) {
			struct scsi_capacity capacity;

			/* READ CAPACITY fields are big-endian on the wire. */
			capacity.capacity =
			    BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >>
			    lsp->ls_lbshift);
			capacity.lbasize = BE_32(1 << lsp->ls_lbshift);
			if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		}

		/*
		 * NOTE(review): 0xff presumably marks the request-sense
		 * status as invalid/unavailable for unsupported CDBs --
		 * confirm against uscsi(4I).
		 */
		uscmd.uscsi_rqstatus = 0xff;
#ifdef _MULTI_DATAMODEL
		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct uscsi_cmd32 ucmd32;
			uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32));
			if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32),
			    flag) != 0)
				return (EFAULT);
			break;
		}
		case DDI_MODEL_NONE:
			if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd),
			    flag) != 0)
				return (EFAULT);
			break;
		default:
			return (EFAULT);
		}
#else
		if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0)
			return (EFAULT);
#endif /* _MULTI_DATAMODEL */
		return (0);
	}

	case DKIOCGMBOOT:
		/* Master boot block: one logical block at offset 0. */
		return (lofi_urw(lsp, FREAD, 0, 1 << lsp->ls_lbshift,
		    arg, flag, credp));

	case DKIOCSMBOOT:
		return (lofi_urw(lsp, FWRITE, 0, 1 << lsp->ls_lbshift,
		    arg, flag, credp));

	case DKIOCGETEFI:
		if (ddi_copyin((void *)arg, &user_efi,
		    sizeof (dk_efi_t), flag) != 0)
			return (EFAULT);

		/* dki_lba is in logical blocks; convert to a byte offset. */
		return (lofi_urw(lsp, FREAD,
		    user_efi.dki_lba * (1 << lsp->ls_lbshift),
		    user_efi.dki_length, (intptr_t)user_efi.dki_data,
		    flag, credp));

	case DKIOCSETEFI:
		if (ddi_copyin((void *)arg, &user_efi,
		    sizeof (dk_efi_t), flag) != 0)
			return (EFAULT);

		return (lofi_urw(lsp, FWRITE,
		    user_efi.dki_lba * (1 << lsp->ls_lbshift),
		    user_efi.dki_length, (intptr_t)user_efi.dki_data,
		    flag, credp));

	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd);
#endif	/* DEBUG */
		return (ENOTTY);
	}
}

/*
 * prop_op(9E) entry point.  When instance state exists, partition-related
 * properties are served by cmlb first; on any cmlb miss (or when there is
 * no state at all) fall back to the default ddi_prop_op(9F) behavior.
 */
static int
lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	struct lofi_state *lsp;
	int rc;

	lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip));
	if (lsp == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL);
	if (rc == DDI_PROP_SUCCESS)
		return (rc);

	/*
	 * Note the fallback deliberately uses DDI_DEV_T_ANY rather than the
	 * passed-in dev, unlike the no-state path above.
	 */
	return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags,
	    name, valuep, lengthp));
}
/* Character/block driver entry points, see cb_ops(9S). */
static struct cb_ops lofi_cb_ops = {
	lofi_open,		/* open */
	lofi_close,		/* close */
	lofi_strategy,		/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	lofi_read,		/* read */
	lofi_write,		/* write */
	lofi_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	lofi_prop_op,		/* prop_op */
	0,			/* streamtab */
	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
	CB_REV,			/* cb_rev */
	lofi_aread,		/* async read */
	lofi_awrite		/* async write */
};

/* Device operations, see dev_ops(9S). */
static struct dev_ops lofi_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	lofi_info,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	lofi_attach,		/* attach */
	lofi_detach,		/* detach */
	nodev,			/* reset */
	&lofi_cb_ops,		/* driver operations */
	NULL,			/* no bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

/* Module linkage: this module is a single device driver. */
static struct modldrv modldrv = {
	&mod_driverops,
	"loopback file driver",
	&lofi_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/*
 * Loadable-module init: create the global mapping list, per-instance soft
 * state, the instance-id space, and the global lock, then register the
 * module.  Each failure path tears down exactly what was built before it,
 * in reverse order.
 */
int
_init(void)
{
	int error;

	list_create(&lofi_list, sizeof (struct lofi_state),
	    offsetof(struct lofi_state, ls_list));

	error = ddi_soft_state_init((void **)&lofi_statep,
	    sizeof (struct lofi_state), 0);
	if (error) {
		list_destroy(&lofi_list);
		return (error);
	}

	/*
	 * The minor number is stored as id << LOFI_CMLB_SHIFT as
	 * we need to reserve space for cmlb minor numbers.
	 * This will leave out 4096 id values on 32bit kernel, which should
	 * still suffice.
	 *
	 * Ids start at 1: instance 0 is the lofictl control device.
	 */
	lofi_id = id_space_create("lofi_id", 1,
	    (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)));

	if (lofi_id == NULL) {
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
		return (DDI_FAILURE);
	}

	mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);

	error = mod_install(&modlinkage);

	if (error) {
		id_space_destroy(lofi_id);
		mutex_destroy(&lofi_lock);
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
	}

	return (error);
}

/*
 * Loadable-module fini: refuse to unload while any file is still mapped,
 * and only destroy global state once mod_remove() has succeeded.
 */
int
_fini(void)
{
	int error;

	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	mutex_exit(&lofi_lock);

	error = mod_remove(&modlinkage);
	if (error)
		return (error);

	mutex_destroy(&lofi_lock);
	id_space_destroy(lofi_id);
	ddi_soft_state_fini((void **)&lofi_statep);
	list_destroy(&lofi_list);

	return (error);
}

/* Report module information via the module linkage. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}