1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26/*
27 * Copyright 2015, Joyent, Inc.
28 */
29
30#include <sys/types.h>
31#include <sys/t_lock.h>
32#include <sys/param.h>
33#include <sys/time.h>
34#include <sys/systm.h>
35#include <sys/sysmacros.h>
36#include <sys/resource.h>
37#include <sys/signal.h>
38#include <sys/cred.h>
39#include <sys/user.h>
40#include <sys/buf.h>
41#include <sys/vfs.h>
42#include <sys/vfs_opreg.h>
43#include <sys/stat.h>
44#include <sys/vnode.h>
45#include <sys/mode.h>
46#include <sys/proc.h>
47#include <sys/disp.h>
48#include <sys/file.h>
49#include <sys/fcntl.h>
50#include <sys/flock.h>
51#include <sys/kmem.h>
52#include <sys/uio.h>
53#include <sys/dnlc.h>
54#include <sys/conf.h>
55#include <sys/errno.h>
56#include <sys/mman.h>
57#include <sys/fbuf.h>
58#include <sys/pathname.h>
59#include <sys/debug.h>
60#include <sys/vmsystm.h>
61#include <sys/cmn_err.h>
62#include <sys/dirent.h>
63#include <sys/errno.h>
64#include <sys/modctl.h>
65#include <sys/statvfs.h>
66#include <sys/mount.h>
67#include <sys/sunddi.h>
68#include <sys/bootconf.h>
69#include <sys/policy.h>
70
71#include <vm/hat.h>
72#include <vm/page.h>
73#include <vm/pvn.h>
74#include <vm/as.h>
75#include <vm/seg.h>
76#include <vm/seg_map.h>
77#include <vm/seg_kmem.h>
78#include <vm/seg_vn.h>
79#include <vm/rm.h>
80#include <vm/page.h>
81#include <sys/swap.h>
82
83#include <fs/fs_subr.h>
84
85#include <sys/fs/udf_volume.h>
86#include <sys/fs/udf_inode.h>
87
88static int32_t udf_open(struct vnode **,
89	int32_t, struct cred *, caller_context_t *);
90static int32_t udf_close(struct vnode *,
91	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
92static int32_t udf_read(struct vnode *,
93	struct uio *, int32_t, struct cred *, caller_context_t *);
94static int32_t udf_write(struct vnode *,
95	struct uio *, int32_t, struct cred *, caller_context_t *);
96static int32_t udf_ioctl(struct vnode *,
97	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
98	caller_context_t *);
99static int32_t udf_getattr(struct vnode *,
100	struct vattr *, int32_t, struct cred *, caller_context_t *);
101static int32_t udf_setattr(struct vnode *,
102	struct vattr *, int32_t, struct cred *, caller_context_t *);
103static int32_t udf_access(struct vnode *,
104	int32_t, int32_t, struct cred *, caller_context_t *);
105static int32_t udf_lookup(struct vnode *,
106	char *, struct vnode **, struct pathname *,
107	int32_t, struct vnode *, struct cred *,
108	caller_context_t *, int *, pathname_t *);
109static int32_t udf_create(struct vnode *,
110	char *, struct vattr *, enum vcexcl,
111	int32_t, struct vnode **, struct cred *, int32_t,
112	caller_context_t *, vsecattr_t *);
113static int32_t udf_remove(struct vnode *,
114	char *, struct cred *, caller_context_t *, int);
115static int32_t udf_link(struct vnode *,
116	struct vnode *, char *, struct cred *, caller_context_t *, int);
117static int32_t udf_rename(struct vnode *,
118	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
119static int32_t udf_mkdir(struct vnode *,
120	char *, struct vattr *, struct vnode **, struct cred *,
121	caller_context_t *, int, vsecattr_t *);
122static int32_t udf_rmdir(struct vnode *,
123	char *, struct vnode *, struct cred *, caller_context_t *, int);
124static int32_t udf_readdir(struct vnode *,
125	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
126static int32_t udf_symlink(struct vnode *,
127	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
128static int32_t udf_readlink(struct vnode *,
129	struct uio *, struct cred *, caller_context_t *);
130static int32_t udf_fsync(struct vnode *,
131	int32_t, struct cred *, caller_context_t *);
132static void udf_inactive(struct vnode *,
133	struct cred *, caller_context_t *);
134static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
135static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
136static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
137static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
138	caller_context_t *);
139static int32_t udf_frlock(struct vnode *, int32_t,
140	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
141	caller_context_t *);
142static int32_t udf_space(struct vnode *, int32_t,
143	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
144static int32_t udf_getpage(struct vnode *, offset_t,
145	size_t, uint32_t *, struct page **, size_t,
146	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
147static int32_t udf_putpage(struct vnode *, offset_t,
148	size_t, int32_t, struct cred *, caller_context_t *);
149static int32_t udf_map(struct vnode *, offset_t, struct as *,
150	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
151	caller_context_t *);
152static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
153	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
154	caller_context_t *);
155static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
156	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
157	caller_context_t *);
158static int32_t udf_l_pathconf(struct vnode *, int32_t,
159	ulong_t *, struct cred *, caller_context_t *);
160static int32_t udf_pageio(struct vnode *, struct page *,
161	u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);
162
163int32_t ud_getpage_miss(struct vnode *, u_offset_t,
164	size_t, struct seg *, caddr_t, page_t *pl[],
165	size_t, enum seg_rw, int32_t);
166void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
167int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
168int32_t ud_page_fill(struct ud_inode *, page_t *,
169	u_offset_t, uint32_t, u_offset_t *);
170int32_t ud_iodone(struct buf *);
171int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
172int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
173int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
174int32_t ud_slave_done(struct buf *);
175
/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave structs hold the working bps that do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op had completed.
 */
184uint32_t master_index = 0;
185typedef struct mio_master {
186	kmutex_t	mm_mutex;	/* protect the fields below */
187	int32_t		mm_size;
188	buf_t		*mm_bp;		/* original bp */
189	int32_t		mm_resid;	/* bytes remaining to transfer */
190	int32_t		mm_error;	/* accumulated error from slaves */
191	int32_t		mm_index;	/* XXX debugging */
192} mio_master_t;
193
194typedef struct mio_slave {
195	buf_t		ms_buf;		/* working buffer for this IO chunk */
196	mio_master_t	*ms_ptr;	/* pointer to master */
197} mio_slave_t;
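
/*
 * A sketch of the intended flow (informational only; the authoritative
 * logic is in ud_multi_strat() and ud_slave_done(), declared above):
 *
 *	ud_multi_strat()
 *		allocate a mio_master_t, point mm_bp at the original bp
 *		from pageio_setup() and set mm_resid to the full transfer
 *		size; for each contiguous extent, allocate a mio_slave_t
 *		whose ms_buf covers just that extent and issue it with
 *		bdev_strategy().
 *	ud_slave_done()
 *		run as each slave completes; subtract the slave's byte
 *		count from mm_resid, accumulate any error in mm_error,
 *		and when mm_resid reaches zero complete the master bp as
 *		if it had been a single IO.
 */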
198
199struct vnodeops *udf_vnodeops;
200
201const fs_operation_def_t udf_vnodeops_template[] = {
202	VOPNAME_OPEN,		{ .vop_open = udf_open },
203	VOPNAME_CLOSE,		{ .vop_close = udf_close },
204	VOPNAME_READ,		{ .vop_read = udf_read },
205	VOPNAME_WRITE,		{ .vop_write = udf_write },
206	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
207	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
208	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
209	VOPNAME_ACCESS,		{ .vop_access = udf_access },
210	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
211	VOPNAME_CREATE,		{ .vop_create = udf_create },
212	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
213	VOPNAME_LINK,		{ .vop_link = udf_link },
214	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
215	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
216	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
217	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
218	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
219	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
220	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
221	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
222	VOPNAME_FID,		{ .vop_fid = udf_fid },
223	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
224	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
225	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
226	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
227	VOPNAME_SPACE,		{ .vop_space = udf_space },
228	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
229	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
230	VOPNAME_MAP,		{ .vop_map = udf_map },
231	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
232	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
233	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
234	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
235	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
236	NULL,			NULL
237};
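
/*
 * The template above is only data; it is turned into the live
 * udf_vnodeops vector during file system initialization (in the VFS
 * side of the module, not in this file).  A minimal sketch of that
 * registration, assuming the standard vn_make_ops() interface, would
 * look like:
 *
 *	if (vn_make_ops("udfs", udf_vnodeops_template, &udf_vnodeops) != 0)
 *		cmn_err(CE_WARN, "udfs: bad vnode ops template");
 */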
238
239/* ARGSUSED */
240static int32_t
241udf_open(
242	struct vnode **vpp,
243	int32_t flag,
244	struct cred *cr,
245	caller_context_t *ct)
246{
247	ud_printf("udf_open\n");
248
249	return (0);
250}
251
252/* ARGSUSED */
253static int32_t
254udf_close(
255	struct vnode *vp,
256	int32_t flag,
257	int32_t count,
258	offset_t offset,
259	struct cred *cr,
260	caller_context_t *ct)
261{
262	struct ud_inode *ip = VTOI(vp);
263
264	ud_printf("udf_close\n");
265
266	ITIMES(ip);
267
268	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
269	cleanshares(vp, ttoproc(curthread)->p_pid);
270
	/*
	 * Push any partially filled cluster at last close.
	 * ``last close'' is only approximated (v_count <= 2 below)
	 * because the dnlc may still have a hold on the vnode.
	 */
276	if (vp->v_count <= 2 && vp->v_type != VBAD) {
277		struct ud_inode *ip = VTOI(vp);
278		if (ip->i_delaylen) {
279			(void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
280			    B_ASYNC | B_FREE, cr);
281			ip->i_delaylen = 0;
282		}
283	}
284
285	return (0);
286}
287
288/* ARGSUSED */
289static int32_t
290udf_read(
291	struct vnode *vp,
292	struct uio *uiop,
293	int32_t ioflag,
294	struct cred *cr,
295	caller_context_t *ct)
296{
297	struct ud_inode *ip = VTOI(vp);
298	int32_t error;
299
300	ud_printf("udf_read\n");
301
302#ifdef	__lock_lint
303	rw_enter(&ip->i_rwlock, RW_READER);
304#endif
305
306	ASSERT(RW_READ_HELD(&ip->i_rwlock));
307
308	if (MANDLOCK(vp, ip->i_char)) {
309		/*
310		 * udf_getattr ends up being called by chklock
311		 */
312		error = chklock(vp, FREAD, uiop->uio_loffset,
313		    uiop->uio_resid, uiop->uio_fmode, ct);
314		if (error) {
315			goto end;
316		}
317	}
318
319	rw_enter(&ip->i_contents, RW_READER);
320	error = ud_rdip(ip, uiop, ioflag, cr);
321	rw_exit(&ip->i_contents);
322
323end:
324#ifdef	__lock_lint
325	rw_exit(&ip->i_rwlock);
326#endif
327
328	return (error);
329}
330
331
332int32_t ud_WRITES = 1;
333int32_t ud_HW = 96 * 1024;
334int32_t ud_LW = 64 * 1024;
335int32_t ud_throttles = 0;
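
/*
 * The tunables above drive the write throttle in udf_write(): when
 * ud_WRITES is non-zero and the bytes of writes in flight on an inode
 * (ip->i_writes) climb past the high-water mark ud_HW, writers block
 * until the backlog drains; ud_throttles counts how often that
 * happened.  ud_LW is the low-water mark at which blocked writers are
 * expected to be woken again by the write completion path.
 */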
336
337/* ARGSUSED */
338static int32_t
339udf_write(
340	struct vnode *vp,
341	struct uio *uiop,
342	int32_t ioflag,
343	struct cred *cr,
344	caller_context_t *ct)
345{
346	struct ud_inode *ip = VTOI(vp);
347	int32_t error = 0;
348
349	ud_printf("udf_write\n");
350
351#ifdef	__lock_lint
352	rw_enter(&ip->i_rwlock, RW_WRITER);
353#endif
354
355	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
356
357	if (MANDLOCK(vp, ip->i_char)) {
		/*
		 * udf_getattr ends up being called by chklock
		 */
361		error = chklock(vp, FWRITE, uiop->uio_loffset,
362		    uiop->uio_resid, uiop->uio_fmode, ct);
363		if (error) {
364			goto end;
365		}
366	}
367	/*
368	 * Throttle writes.
369	 */
370	mutex_enter(&ip->i_tlock);
371	if (ud_WRITES && (ip->i_writes > ud_HW)) {
372		while (ip->i_writes > ud_HW) {
373			ud_throttles++;
374			cv_wait(&ip->i_wrcv, &ip->i_tlock);
375		}
376	}
377	mutex_exit(&ip->i_tlock);
378
379	/*
380	 * Write to the file
381	 */
382	rw_enter(&ip->i_contents, RW_WRITER);
383	if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
384		/*
385		 * In append mode start at end of file.
386		 */
387		uiop->uio_loffset = ip->i_size;
388	}
389	error = ud_wrip(ip, uiop, ioflag, cr);
390	rw_exit(&ip->i_contents);
391
392end:
393#ifdef	__lock_lint
394	rw_exit(&ip->i_rwlock);
395#endif
396
397	return (error);
398}
399
400/* ARGSUSED */
401static int32_t
402udf_ioctl(
403	struct vnode *vp,
404	int32_t cmd,
405	intptr_t arg,
406	int32_t flag,
407	struct cred *cr,
408	int32_t *rvalp,
409	caller_context_t *ct)
410{
411	return (ENOTTY);
412}
413
414/* ARGSUSED */
415static int32_t
416udf_getattr(
417	struct vnode *vp,
418	struct vattr *vap,
419	int32_t flags,
420	struct cred *cr,
421	caller_context_t *ct)
422{
423	struct ud_inode *ip = VTOI(vp);
424
425	ud_printf("udf_getattr\n");
426
427	if (vap->va_mask == AT_SIZE) {
		/*
		 * For performance, if only the size is requested, don't
		 * bother with anything else.
		 */
432		vap->va_size = ip->i_size;
433		return (0);
434	}
435
436	rw_enter(&ip->i_contents, RW_READER);
437
438	vap->va_type = vp->v_type;
439	vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
440
441	vap->va_uid = ip->i_uid;
442	vap->va_gid = ip->i_gid;
443	vap->va_fsid = ip->i_dev;
444	vap->va_nodeid = ip->i_icb_lbano;
445	vap->va_nlink = ip->i_nlink;
446	vap->va_size = ip->i_size;
447	vap->va_seq = ip->i_seq;
448	if (vp->v_type == VCHR || vp->v_type == VBLK) {
449		vap->va_rdev = ip->i_rdev;
450	} else {
451		vap->va_rdev = 0;
452	}
453
454	mutex_enter(&ip->i_tlock);
455	ITIMES_NOLOCK(ip);	/* mark correct time in inode */
456	vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
457	vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
458	vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
459	vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
460	vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
461	vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
462	mutex_exit(&ip->i_tlock);
463
464	switch (ip->i_type) {
465		case VBLK:
466			vap->va_blksize = MAXBSIZE;
467			break;
468		case VCHR:
469			vap->va_blksize = MAXBSIZE;
470			break;
471		default:
472			vap->va_blksize = ip->i_udf->udf_lbsize;
473			break;
474	}
475	vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
476
477	rw_exit(&ip->i_contents);
478
479	return (0);
480}
481
482static int
483ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
484{
485	return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0));
486}
487
488/*ARGSUSED4*/
489static int32_t
490udf_setattr(
491	struct vnode *vp,
492	struct vattr *vap,
493	int32_t flags,
494	struct cred *cr,
495	caller_context_t *ct)
496{
497	int32_t error = 0;
498	uint32_t mask = vap->va_mask;
499	struct ud_inode *ip;
500	timestruc_t now;
501	struct vattr ovap;
502
503	ud_printf("udf_setattr\n");
504
505	ip = VTOI(vp);
506
	/*
	 * No updates allowed to files using allocation strategy type 4096.
	 */
510	if (ip->i_astrat == STRAT_TYPE4096) {
511		return (EINVAL);
512	}
513
514	/*
515	 * Cannot set these attributes
516	 */
517	if (mask & AT_NOSET) {
518		return (EINVAL);
519	}
520
521	rw_enter(&ip->i_rwlock, RW_WRITER);
522	rw_enter(&ip->i_contents, RW_WRITER);
523
524	ovap.va_uid = ip->i_uid;
525	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
526	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
527	    ud_iaccess_vmode, ip);
528	if (error)
529		goto update_inode;
530
531	mask = vap->va_mask;
532	/*
533	 * Change file access modes.
534	 */
535	if (mask & AT_MODE) {
536		ip->i_perm = VA2UD_PERM(vap->va_mode);
537		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
538		mutex_enter(&ip->i_tlock);
539		ip->i_flag |= ICHG;
540		mutex_exit(&ip->i_tlock);
541	}
542	if (mask & (AT_UID|AT_GID)) {
543		if (mask & AT_UID) {
544			ip->i_uid = vap->va_uid;
545		}
546		if (mask & AT_GID) {
547			ip->i_gid = vap->va_gid;
548		}
549		mutex_enter(&ip->i_tlock);
550		ip->i_flag |= ICHG;
551		mutex_exit(&ip->i_tlock);
552	}
553	/*
554	 * Truncate file.  Must have write permission and not be a directory.
555	 */
556	if (mask & AT_SIZE) {
557		if (vp->v_type == VDIR) {
558			error = EISDIR;
559			goto update_inode;
560		}
561		if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
562			goto update_inode;
563		}
564		if (vap->va_size > MAXOFFSET_T) {
565			error = EFBIG;
566			goto update_inode;
567		}
568		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
569			goto update_inode;
570		}
571
572		if (vap->va_size == 0)
573			vnevent_truncate(vp, ct);
574	}
575	/*
576	 * Change file access or modified times.
577	 */
578	if (mask & (AT_ATIME|AT_MTIME)) {
579		mutex_enter(&ip->i_tlock);
580		if (mask & AT_ATIME) {
581			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
582			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
583			ip->i_flag &= ~IACC;
584		}
585		if (mask & AT_MTIME) {
586			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
587			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
588			gethrestime(&now);
589			ip->i_ctime.tv_sec = now.tv_sec;
590			ip->i_ctime.tv_nsec = now.tv_nsec;
591			ip->i_flag &= ~(IUPD|ICHG);
592			ip->i_flag |= IMODTIME;
593		}
594		ip->i_flag |= IMOD;
595		mutex_exit(&ip->i_tlock);
596	}
597
598update_inode:
599	if (curthread->t_flag & T_DONTPEND) {
600		ud_iupdat(ip, 1);
601	} else {
602		ITIMES_NOLOCK(ip);
603	}
604	rw_exit(&ip->i_contents);
605	rw_exit(&ip->i_rwlock);
606
607	return (error);
608}
609
610/* ARGSUSED */
611static int32_t
612udf_access(
613	struct vnode *vp,
614	int32_t mode,
615	int32_t flags,
616	struct cred *cr,
617	caller_context_t *ct)
618{
619	struct ud_inode *ip = VTOI(vp);
620
621	ud_printf("udf_access\n");
622
623	if (ip->i_udf == NULL) {
624		return (EIO);
625	}
626
627	return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
628}
629
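/*
 * udfs_stickyhack mirrors the traditional Solaris "sticky hack": when a
 * non-directory has the sticky bit set but no execute permission,
 * udf_lookup() below marks the vnode VISSWAP so its pages are handled
 * specially by the VM system.  Set to 0 to disable that marking.
 */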
630int32_t udfs_stickyhack = 1;
631
632/* ARGSUSED */
633static int32_t
634udf_lookup(
635	struct vnode *dvp,
636	char *nm,
637	struct vnode **vpp,
638	struct pathname *pnp,
639	int32_t flags,
640	struct vnode *rdir,
641	struct cred *cr,
642	caller_context_t *ct,
643	int *direntflags,
644	pathname_t *realpnp)
645{
646	int32_t error;
647	struct vnode *vp;
648	struct ud_inode *ip, *xip;
649
650	ud_printf("udf_lookup\n");
651	/*
652	 * Null component name is a synonym for directory being searched.
653	 */
654	if (*nm == '\0') {
655		VN_HOLD(dvp);
656		*vpp = dvp;
657		error = 0;
658		goto out;
659	}
660
661	/*
662	 * Fast path: Check the directory name lookup cache.
663	 */
664	ip = VTOI(dvp);
665	if (vp = dnlc_lookup(dvp, nm)) {
666		/*
667		 * Check accessibility of directory.
668		 */
669		if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
670			VN_RELE(vp);
671		}
672		xip = VTOI(vp);
673	} else {
674		error = ud_dirlook(ip, nm, &xip, cr, 1);
675		ITIMES(ip);
676	}
677
678	if (error == 0) {
679		ip = xip;
680		*vpp = ITOV(ip);
681		if ((ip->i_type != VDIR) &&
682		    (ip->i_char & ISVTX) &&
683		    ((ip->i_perm & IEXEC) == 0) &&
684		    udfs_stickyhack) {
685			mutex_enter(&(*vpp)->v_lock);
686			(*vpp)->v_flag |= VISSWAP;
687			mutex_exit(&(*vpp)->v_lock);
688		}
689		ITIMES(ip);
690		/*
691		 * If vnode is a device return special vnode instead.
692		 */
693		if (IS_DEVVP(*vpp)) {
694			struct vnode *newvp;
695			newvp = specvp(*vpp, (*vpp)->v_rdev,
696			    (*vpp)->v_type, cr);
697			VN_RELE(*vpp);
698			if (newvp == NULL) {
699				error = ENOSYS;
700			} else {
701				*vpp = newvp;
702			}
703		}
704	}
705out:
706	return (error);
707}
708
709/* ARGSUSED */
710static int32_t
711udf_create(
712	struct vnode *dvp,
713	char *name,
714	struct vattr *vap,
715	enum vcexcl excl,
716	int32_t mode,
717	struct vnode **vpp,
718	struct cred *cr,
719	int32_t flag,
720	caller_context_t *ct,
721	vsecattr_t *vsecp)
722{
723	int32_t error;
724	struct ud_inode *ip = VTOI(dvp), *xip;
725
726	ud_printf("udf_create\n");
727
728	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
729		vap->va_mode &= ~VSVTX;
730
731	if (*name == '\0') {
732		/*
733		 * Null component name refers to the directory itself.
734		 */
735		VN_HOLD(dvp);
736		ITIMES(ip);
737		error = EEXIST;
738	} else {
739		xip = NULL;
740		rw_enter(&ip->i_rwlock, RW_WRITER);
741		error = ud_direnter(ip, name, DE_CREATE,
742		    (struct ud_inode *)0, (struct ud_inode *)0,
743		    vap, &xip, cr, ct);
744		rw_exit(&ip->i_rwlock);
745		ITIMES(ip);
746		ip = xip;
747	}
748#ifdef	__lock_lint
749	rw_enter(&ip->i_contents, RW_WRITER);
750#else
751	if (ip != NULL) {
752		rw_enter(&ip->i_contents, RW_WRITER);
753	}
754#endif
755
756	/*
757	 * If the file already exists and this is a non-exclusive create,
758	 * check permissions and allow access for non-directories.
759	 * Read-only create of an existing directory is also allowed.
760	 * We fail an exclusive create of anything which already exists.
761	 */
762	if (error == EEXIST) {
763		if (excl == NONEXCL) {
764			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
765				error = EISDIR;
766			} else if (mode) {
767				error = ud_iaccess(ip,
768				    UD_UPERM2DPERM(mode), cr, 0);
769			} else {
770				error = 0;
771			}
772		}
773		if (error) {
774			rw_exit(&ip->i_contents);
775			VN_RELE(ITOV(ip));
776			goto out;
777		} else if ((ip->i_type == VREG) &&
778		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
779			/*
780			 * Truncate regular files, if requested by caller.
781			 * Grab i_rwlock to make sure no one else is
782			 * currently writing to the file (we promised
783			 * bmap we would do this).
784			 * Must get the locks in the correct order.
785			 */
786			if (ip->i_size == 0) {
787				ip->i_flag |= ICHG | IUPD;
788			} else {
789				rw_exit(&ip->i_contents);
790				rw_enter(&ip->i_rwlock, RW_WRITER);
791				rw_enter(&ip->i_contents, RW_WRITER);
792				(void) ud_itrunc(ip, 0, 0, cr);
793				rw_exit(&ip->i_rwlock);
794			}
795			vnevent_create(ITOV(ip), ct);
796		}
797	}
798
799	if (error == 0) {
800		*vpp = ITOV(ip);
801		ITIMES(ip);
802	}
803#ifdef	__lock_lint
804	rw_exit(&ip->i_contents);
805#else
806	if (ip != NULL) {
807		rw_exit(&ip->i_contents);
808	}
809#endif
810	if (error) {
811		goto out;
812	}
813
814	/*
815	 * If vnode is a device return special vnode instead.
816	 */
817	if (!error && IS_DEVVP(*vpp)) {
818		struct vnode *newvp;
819
820		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
821		VN_RELE(*vpp);
822		if (newvp == NULL) {
823			error = ENOSYS;
824			goto out;
825		}
826		*vpp = newvp;
827	}
828out:
829	return (error);
830}
831
832/* ARGSUSED */
833static int32_t
834udf_remove(
835	struct vnode *vp,
836	char *nm,
837	struct cred *cr,
838	caller_context_t *ct,
839	int flags)
840{
841	int32_t error;
842	struct ud_inode *ip = VTOI(vp);
843
844	ud_printf("udf_remove\n");
845
846	rw_enter(&ip->i_rwlock, RW_WRITER);
847	error = ud_dirremove(ip, nm,
848	    (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
849	rw_exit(&ip->i_rwlock);
850	ITIMES(ip);
851
852	return (error);
853}
854
855/* ARGSUSED */
856static int32_t
857udf_link(
858	struct vnode *tdvp,
859	struct vnode *svp,
860	char *tnm,
861	struct cred *cr,
862	caller_context_t *ct,
863	int flags)
864{
865	int32_t error;
866	struct vnode *realvp;
867	struct ud_inode *sip;
868	struct ud_inode *tdp;
869
870	ud_printf("udf_link\n");
871	if (VOP_REALVP(svp, &realvp, ct) == 0) {
872		svp = realvp;
873	}
874
875	/*
876	 * Do not allow links to directories
877	 */
878	if (svp->v_type == VDIR) {
879		return (EPERM);
880	}
881
882	sip = VTOI(svp);
883
884	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
885		return (EPERM);
886
887	tdp = VTOI(tdvp);
888
889	rw_enter(&tdp->i_rwlock, RW_WRITER);
890	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
891	    sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
892	rw_exit(&tdp->i_rwlock);
893	ITIMES(sip);
894	ITIMES(tdp);
895
896	if (error == 0) {
897		vnevent_link(svp, ct);
898	}
899
900	return (error);
901}
902
903/* ARGSUSED */
904static int32_t
905udf_rename(
906	struct vnode *sdvp,
907	char *snm,
908	struct vnode *tdvp,
909	char *tnm,
910	struct cred *cr,
911	caller_context_t *ct,
912	int flags)
913{
914	int32_t error = 0;
915	struct udf_vfs *udf_vfsp;
916	struct ud_inode *sip;		/* source inode */
917	struct ud_inode *tip;		/* target inode */
918	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
919	struct vnode *realvp;
920
921	ud_printf("udf_rename\n");
922
923	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
924		tdvp = realvp;
925	}
926
927	sdp = VTOI(sdvp);
928	tdp = VTOI(tdvp);
929
930	udf_vfsp = sdp->i_udf;
931
932	mutex_enter(&udf_vfsp->udf_rename_lck);
933	/*
934	 * Look up inode of file we're supposed to rename.
935	 */
936	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
937		mutex_exit(&udf_vfsp->udf_rename_lck);
938		return (error);
939	}
	/*
	 * Be sure this is not a directory with another file system mounted
	 * over it.  If it is, just give up the locks and return EBUSY.
	 */
945	if (vn_mountedvfs(ITOV(sip)) != NULL) {
946		error = EBUSY;
947		goto errout;
948	}
949	/*
950	 * Make sure we can delete the source entry.  This requires
951	 * write permission on the containing directory.  If that
952	 * directory is "sticky" it further requires (except for
953	 * privileged users) that the user own the directory or the
954	 * source entry, or else have permission to write the source
955	 * entry.
956	 */
957	rw_enter(&sdp->i_contents, RW_READER);
958	rw_enter(&sip->i_contents, RW_READER);
959	if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
960	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
961		rw_exit(&sip->i_contents);
962		rw_exit(&sdp->i_contents);
963		ITIMES(sip);
964		goto errout;
965	}
966
967	/*
968	 * Check for renaming '.' or '..' or alias of '.'
969	 */
970	if ((strcmp(snm, ".") == 0) ||
971	    (strcmp(snm, "..") == 0) ||
972	    (sdp == sip)) {
973		error = EINVAL;
974		rw_exit(&sip->i_contents);
975		rw_exit(&sdp->i_contents);
976		goto errout;
977	}
978
979	rw_exit(&sip->i_contents);
980	rw_exit(&sdp->i_contents);
981
982	if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
983		vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
984		VN_RELE(ITOV(tip));
985	}
986
987	/* Notify the target dir. if not the same as the source dir. */
988	if (sdvp != tdvp)
989		vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);
990
991	vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);
992
993	/*
994	 * Link source to the target.
995	 */
996	rw_enter(&tdp->i_rwlock, RW_WRITER);
997	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
998	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
999		/*
1000		 * ESAME isn't really an error; it indicates that the
1001		 * operation should not be done because the source and target
1002		 * are the same file, but that no error should be reported.
1003		 */
1004		if (error == ESAME) {
1005			error = 0;
1006		}
1007		rw_exit(&tdp->i_rwlock);
1008		goto errout;
1009	}
1010	rw_exit(&tdp->i_rwlock);
1011
1012	rw_enter(&sdp->i_rwlock, RW_WRITER);
1013	/*
1014	 * Unlink the source.
1015	 * Remove the source entry.  ud_dirremove() checks that the entry
1016	 * still reflects sip, and returns an error if it doesn't.
1017	 * If the entry has changed just forget about it.  Release
1018	 * the source inode.
1019	 */
1020	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
1021	    DR_RENAME, cr, ct)) == ENOENT) {
1022		error = 0;
1023	}
1024	rw_exit(&sdp->i_rwlock);
1025
1026	if (error == 0) {
1027		vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
1028		/*
1029		 * vnevent_rename_dest and vnevent_rename_dest_dir are called
1030		 * in ud_direnter().
1031		 */
1032	}
1033
1034errout:
1035	ITIMES(sdp);
1036	ITIMES(tdp);
1037	VN_RELE(ITOV(sip));
1038	mutex_exit(&udf_vfsp->udf_rename_lck);
1039
1040	return (error);
1041}
1042
1043/* ARGSUSED */
1044static int32_t
1045udf_mkdir(
1046	struct vnode *dvp,
1047	char *dirname,
1048	struct vattr *vap,
1049	struct vnode **vpp,
1050	struct cred *cr,
1051	caller_context_t *ct,
1052	int flags,
1053	vsecattr_t *vsecp)
1054{
1055	int32_t error;
1056	struct ud_inode *ip;
1057	struct ud_inode *xip;
1058
1059	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1060
1061	ud_printf("udf_mkdir\n");
1062
1063	ip = VTOI(dvp);
1064	rw_enter(&ip->i_rwlock, RW_WRITER);
1065	error = ud_direnter(ip, dirname, DE_MKDIR,
1066	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
1067	rw_exit(&ip->i_rwlock);
1068	ITIMES(ip);
1069	if (error == 0) {
1070		ip = xip;
1071		*vpp = ITOV(ip);
1072		ITIMES(ip);
1073	} else if (error == EEXIST) {
1074		ITIMES(xip);
1075		VN_RELE(ITOV(xip));
1076	}
1077
1078	return (error);
1079}
1080
1081/* ARGSUSED */
1082static int32_t
1083udf_rmdir(
1084	struct vnode *vp,
1085	char *nm,
1086	struct vnode *cdir,
1087	struct cred *cr,
1088	caller_context_t *ct,
1089	int flags)
1090{
1091	int32_t error;
1092	struct ud_inode *ip = VTOI(vp);
1093
1094	ud_printf("udf_rmdir\n");
1095
1096	rw_enter(&ip->i_rwlock, RW_WRITER);
1097	error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
1098	    cr, ct);
1099	rw_exit(&ip->i_rwlock);
1100	ITIMES(ip);
1101
1102	return (error);
1103}
1104
1105/* ARGSUSED */
1106static int32_t
1107udf_readdir(
1108	struct vnode *vp,
1109	struct uio *uiop,
1110	struct cred *cr,
1111	int32_t *eofp,
1112	caller_context_t *ct,
1113	int flags)
1114{
1115	struct ud_inode *ip;
1116	struct dirent64 *nd;
1117	struct udf_vfs *udf_vfsp;
1118	int32_t error = 0, len, outcount = 0;
1119	uint32_t dirsiz, offset;
1120	uint32_t bufsize, ndlen, dummy;
1121	caddr_t outbuf;
1122	caddr_t outb, end_outb;
1123	struct iovec *iovp;
1124
1125	uint8_t *dname;
1126	int32_t length;
1127
1128	uint8_t *buf = NULL;
1129
1130	struct fbuf *fbp = NULL;
1131	struct file_id *fid;
1132	uint8_t *name;
1133
1134
1135	ud_printf("udf_readdir\n");
1136
1137	ip = VTOI(vp);
1138	udf_vfsp = ip->i_udf;
1139
1140	dirsiz = ip->i_size;
1141	if ((uiop->uio_offset >= dirsiz) ||
1142	    (ip->i_nlink <= 0)) {
1143		if (eofp) {
1144			*eofp = 1;
1145		}
1146		return (0);
1147	}
1148
1149	offset = uiop->uio_offset;
1150	iovp = uiop->uio_iov;
1151	bufsize = iovp->iov_len;
1152
1153	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
1154	end_outb = outb + bufsize;
1155	nd = (struct dirent64 *)outbuf;
1156
1157	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
1158	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
1159
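	/*
	 * The "." entry is synthesized here rather than read from the
	 * media: on the first call (offset 0) it is emitted with a cookie
	 * (d_off) of 0x10, and when a later call resumes at offset 0x10
	 * that is mapped back to 0 so the real file identifiers in the
	 * directory are scanned from the beginning.  ".." comes from the
	 * FID_PARENT entry in the loop below.
	 */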
1160	if (offset == 0) {
1161		len = DIRENT64_RECLEN(1);
1162		if (((caddr_t)nd + len) >= end_outb) {
1163			error = EINVAL;
1164			goto end;
1165		}
1166		nd->d_ino = ip->i_icb_lbano;
1167		nd->d_reclen = (uint16_t)len;
1168		nd->d_off = 0x10;
1169		nd->d_name[0] = '.';
1170		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
1171		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
1172		outcount++;
1173	} else if (offset == 0x10) {
1174		offset = 0;
1175	}
1176
1177	while (offset < dirsiz) {
1178		error = ud_get_next_fid(ip, &fbp,
1179		    offset, &fid, &name, buf);
1180		if (error != 0) {
1181			break;
1182		}
1183
1184		if ((fid->fid_flags & FID_DELETED) == 0) {
1185			if (fid->fid_flags & FID_PARENT) {
1186
1187				len = DIRENT64_RECLEN(2);
1188				if (((caddr_t)nd + len) >= end_outb) {
1189					error = EINVAL;
1190					break;
1191				}
1192
1193				nd->d_ino = ip->i_icb_lbano;
1194				nd->d_reclen = (uint16_t)len;
1195				nd->d_off = offset + FID_LEN(fid);
1196				nd->d_name[0] = '.';
1197				nd->d_name[1] = '.';
1198				bzero(&nd->d_name[2],
1199				    DIRENT64_NAMELEN(len) - 2);
1200				nd = (struct dirent64 *)
1201				    ((char *)nd + nd->d_reclen);
1202			} else {
1203				if ((error = ud_uncompress(fid->fid_idlen,
1204				    &length, name, dname)) != 0) {
1205					break;
1206				}
1207				if (length == 0) {
1208					offset += FID_LEN(fid);
1209					continue;
1210				}
1211				len = DIRENT64_RECLEN(length);
1212				if (((caddr_t)nd + len) >= end_outb) {
1213					if (!outcount) {
1214						error = EINVAL;
1215					}
1216					break;
1217				}
1218				(void) strncpy(nd->d_name,
1219				    (caddr_t)dname, length);
1220				bzero(&nd->d_name[length],
1221				    DIRENT64_NAMELEN(len) - length);
1222				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
1223				    SWAP_16(fid->fid_icb.lad_ext_prn),
1224				    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
1225				    &dummy);
1226				nd->d_reclen = (uint16_t)len;
1227				nd->d_off = offset + FID_LEN(fid);
1228				nd = (struct dirent64 *)
1229				    ((char *)nd + nd->d_reclen);
1230			}
1231			outcount++;
1232		}
1233
1234		offset += FID_LEN(fid);
1235	}
1236
1237end:
1238	if (fbp != NULL) {
1239		fbrelse(fbp, S_OTHER);
1240	}
1241	ndlen = ((char *)nd - outbuf);
1242	/*
1243	 * In case of error do not call uiomove.
1244	 * Return the error to the caller.
1245	 */
1246	if ((error == 0) && (ndlen != 0)) {
1247		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
1248		uiop->uio_offset = offset;
1249	}
1250	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
1251	kmem_free((caddr_t)dname, 1024);
1252	kmem_free(outbuf, (uint32_t)bufsize);
1253	if (eofp && error == 0) {
1254		*eofp = (uiop->uio_offset >= dirsiz);
1255	}
1256	return (error);
1257}
1258
1259/* ARGSUSED */
1260static int32_t
1261udf_symlink(
1262	struct vnode *dvp,
1263	char *linkname,
1264	struct vattr *vap,
1265	char *target,
1266	struct cred *cr,
1267	caller_context_t *ct,
1268	int flags)
1269{
1270	int32_t error = 0, outlen;
1271	uint32_t ioflag = 0;
1272	struct ud_inode *ip, *dip = VTOI(dvp);
1273
1274	struct path_comp *pc;
1275	int8_t *dname = NULL, *uname = NULL, *sp;
1276
1277	ud_printf("udf_symlink\n");
1278
1279	ip = (struct ud_inode *)0;
1280	vap->va_type = VLNK;
1281	vap->va_rdev = 0;
1282
1283	rw_enter(&dip->i_rwlock, RW_WRITER);
1284	error = ud_direnter(dip, linkname, DE_CREATE,
1285	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
1286	rw_exit(&dip->i_rwlock);
1287	if (error == 0) {
1288		dname = kmem_zalloc(1024, KM_SLEEP);
1289		uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1290
1291		pc = (struct path_comp *)uname;
		/*
		 * If the target begins with "/", emit an absolute-path
		 * (root) component for it and then skip the leading slashes.
		 */
1296		if (*target == '/') {
1297			pc->pc_type = 2;
1298			pc->pc_len = 0;
1299			pc = (struct path_comp *)(((char *)pc) + 4);
1300			while (*target == '/') {
1301				target++;
1302			}
1303		}
1304
1305		while (*target != '\0') {
1306			sp = target;
1307			while ((*target != '/') && (*target != '\0')) {
1308				target ++;
1309			}
			/*
			 * We got the next component of the
			 * path name.  Create a path_comp of
			 * the appropriate type.
			 */
1315			if (((target - sp) == 1) && (*sp == '.')) {
1316				/*
1317				 * Dot entry.
1318				 */
1319				pc->pc_type = 4;
1320				pc = (struct path_comp *)(((char *)pc) + 4);
1321			} else if (((target - sp) == 2) &&
1322			    (*sp == '.') && ((*(sp + 1)) == '.')) {
1323				/*
1324				 * DotDot entry.
1325				 */
1326				pc->pc_type = 3;
1327				pc = (struct path_comp *)(((char *)pc) + 4);
1328			} else {
				/*
				 * Convert the user-supplied name
				 * into the form that is stored
				 * on the media.
				 */
1334				outlen = 1024;	/* set to size of dname */
1335				if (error = ud_compress(target - sp, &outlen,
1336				    (uint8_t *)sp, (uint8_t *)dname)) {
1337					break;
1338				}
1339				pc->pc_type = 5;
1340				/* LINTED */
1341				pc->pc_len = outlen;
1342				dname[outlen] = '\0';
1343				(void) strcpy((char *)pc->pc_id, dname);
1344				pc = (struct path_comp *)
1345				    (((char *)pc) + 4 + outlen);
1346			}
1347			while (*target == '/') {
1348				target++;
1349			}
1350			if (*target == '\0') {
1351				break;
1352			}
1353		}
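
		/*
		 * At this point uname holds the on-media path component
		 * string.  As an illustration (named components are stored
		 * in their ud_compress()ed form), a target of "/usr/./lib"
		 * would be laid out as:
		 *
		 *	type 2, len 0		absolute-path (root) component
		 *	type 5, id "usr"	named component
		 *	type 4, len 0		"." component
		 *	type 5, id "lib"	named component
		 */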
1354
1355		rw_enter(&ip->i_contents, RW_WRITER);
1356		if (error == 0) {
1357			ioflag = FWRITE;
1358			if (curthread->t_flag & T_DONTPEND) {
1359				ioflag |= FDSYNC;
1360			}
1361			error = ud_rdwri(UIO_WRITE, ioflag, ip,
1362			    uname, ((int8_t *)pc) - uname,
1363			    (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1364		}
1365		if (error) {
1366			ud_idrop(ip);
1367			rw_exit(&ip->i_contents);
1368			rw_enter(&dip->i_rwlock, RW_WRITER);
1369			(void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1370			    (struct vnode *)0, DR_REMOVE, cr, ct);
1371			rw_exit(&dip->i_rwlock);
1372			goto update_inode;
1373		}
1374		rw_exit(&ip->i_contents);
1375	}
1376
1377	if ((error == 0) || (error == EEXIST)) {
1378		VN_RELE(ITOV(ip));
1379	}
1380
1381update_inode:
1382	ITIMES(VTOI(dvp));
1383	if (uname != NULL) {
1384		kmem_free(uname, PAGESIZE);
1385	}
1386	if (dname != NULL) {
1387		kmem_free(dname, 1024);
1388	}
1389
1390	return (error);
1391}
1392
1393/* ARGSUSED */
1394static int32_t
1395udf_readlink(
1396	struct vnode *vp,
1397	struct uio *uiop,
1398	struct cred *cr,
1399	caller_context_t *ct)
1400{
1401	int32_t error = 0, off, id_len, size, len;
1402	int8_t *dname = NULL, *uname = NULL;
1403	struct ud_inode *ip;
1404	struct fbuf *fbp = NULL;
1405	struct path_comp *pc;
1406
1407	ud_printf("udf_readlink\n");
1408
1409	if (vp->v_type != VLNK) {
1410		return (EINVAL);
1411	}
1412
1413	ip = VTOI(vp);
1414	size = ip->i_size;
1415	if (size > PAGESIZE) {
1416		return (EIO);
1417	}
1418
1419	if (size == 0) {
1420		return (0);
1421	}
1422
1423	dname = kmem_zalloc(1024, KM_SLEEP);
1424	uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1425
1426	rw_enter(&ip->i_contents, RW_READER);
1427
1428	if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1429		goto end;
1430	}
1431
1432	off = 0;
1433
1434	while (off < size) {
1435		pc = (struct path_comp *)(fbp->fb_addr + off);
1436		switch (pc->pc_type) {
1437			case 1 :
1438				(void) strcpy(uname, ip->i_udf->udf_fsmnt);
1439				(void) strcat(uname, "/");
1440				break;
1441			case 2 :
1442				if (pc->pc_len != 0) {
1443					goto end;
1444				}
1445				uname[0] = '/';
1446				uname[1] = '\0';
1447				break;
1448			case 3 :
1449				(void) strcat(uname, "../");
1450				break;
1451			case 4 :
1452				(void) strcat(uname, "./");
1453				break;
1454			case 5 :
1455				if ((error = ud_uncompress(pc->pc_len, &id_len,
1456				    pc->pc_id, (uint8_t *)dname)) != 0) {
1457					break;
1458				}
1459				dname[id_len] = '\0';
1460				(void) strcat(uname, dname);
1461				(void) strcat(uname, "/");
1462				break;
1463			default :
1464				error = EINVAL;
1465				goto end;
1466		}
1467		off += 4 + pc->pc_len;
1468	}
1469	len = strlen(uname) - 1;
1470	if (uname[len] == '/') {
1471		if (len == 0) {
1472			/*
1473			 * special case link to /
1474			 */
1475			len = 1;
1476		} else {
1477			uname[len] = '\0';
1478		}
1479	}
1480
1481	error = uiomove(uname, len, UIO_READ, uiop);
1482
1483	ITIMES(ip);
1484
1485end:
1486	if (fbp != NULL) {
1487		fbrelse(fbp, S_OTHER);
1488	}
1489	rw_exit(&ip->i_contents);
1490	if (uname != NULL) {
1491		kmem_free(uname, PAGESIZE);
1492	}
1493	if (dname != NULL) {
1494		kmem_free(dname, 1024);
1495	}
1496	return (error);
1497}
1498
1499/* ARGSUSED */
1500static int32_t
1501udf_fsync(
1502	struct vnode *vp,
1503	int32_t syncflag,
1504	struct cred *cr,
1505	caller_context_t *ct)
1506{
1507	int32_t error = 0;
1508	struct ud_inode *ip = VTOI(vp);
1509
1510	ud_printf("udf_fsync\n");
1511
1512	rw_enter(&ip->i_contents, RW_WRITER);
1513	if (!(IS_SWAPVP(vp))) {
1514		error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1515	}
1516	if (error == 0) {
1517		error = ud_sync_indir(ip);
1518	}
1519	ITIMES(ip);		/* XXX: is this necessary ??? */
1520	rw_exit(&ip->i_contents);
1521
1522	return (error);
1523}
1524
1525/* ARGSUSED */
1526static void
1527udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1528{
	ud_printf("udf_inactive\n");
1530
1531	ud_iinactive(VTOI(vp), cr);
1532}
1533
1534/* ARGSUSED */
1535static int32_t
1536udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1537{
1538	struct udf_fid *udfidp;
1539	struct ud_inode *ip = VTOI(vp);
1540
1541	ud_printf("udf_fid\n");
1542
1543	if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1544		fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1545		return (ENOSPC);
1546	}
1547
1548	udfidp = (struct udf_fid *)fidp;
1549	bzero((char *)udfidp, sizeof (struct udf_fid));
1550	rw_enter(&ip->i_contents, RW_READER);
1551	udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1552	udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1553	udfidp->udfid_prn = ip->i_icb_prn;
1554	udfidp->udfid_icb_lbn = ip->i_icb_block;
1555	rw_exit(&ip->i_contents);
1556
1557	return (0);
1558}
1559
1560/* ARGSUSED2 */
1561static int
1562udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1563{
1564	struct ud_inode *ip = VTOI(vp);
1565
1566	ud_printf("udf_rwlock\n");
1567
1568	if (write_lock) {
1569		rw_enter(&ip->i_rwlock, RW_WRITER);
1570	} else {
1571		rw_enter(&ip->i_rwlock, RW_READER);
1572	}
1573#ifdef	__lock_lint
1574	rw_exit(&ip->i_rwlock);
1575#endif
1576	return (write_lock);
1577}
1578
1579/* ARGSUSED */
1580static void
1581udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1582{
1583	struct ud_inode *ip = VTOI(vp);
1584
1585	ud_printf("udf_rwunlock\n");
1586
1587#ifdef	__lock_lint
1588	rw_enter(&ip->i_rwlock, RW_WRITER);
1589#endif
1590
1591	rw_exit(&ip->i_rwlock);
1592
1593}
1594
1595/* ARGSUSED */
1596static int32_t
1597udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1598{
1599	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1600}
1601
1602static int32_t
1603udf_frlock(
1604	struct vnode *vp,
1605	int32_t cmd,
1606	struct flock64 *bfp,
1607	int32_t flag,
1608	offset_t offset,
1609	struct flk_callback *flk_cbp,
1610	cred_t *cr,
1611	caller_context_t *ct)
1612{
1613	struct ud_inode *ip = VTOI(vp);
1614
1615	ud_printf("udf_frlock\n");
1616
1617	/*
1618	 * If file is being mapped, disallow frlock.
1619	 * XXX I am not holding tlock while checking i_mapcnt because the
1620	 * current locking strategy drops all locks before calling fs_frlock.
	 * So, mapcnt could change before we enter fs_frlock, making it
1622	 * meaningless to have held tlock in the first place.
1623	 */
1624	if ((ip->i_mapcnt > 0) &&
1625	    (MANDLOCK(vp, ip->i_char))) {
1626		return (EAGAIN);
1627	}
1628
1629	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1630}
1631
1632/*ARGSUSED6*/
1633static int32_t
1634udf_space(
1635	struct vnode *vp,
1636	int32_t cmd,
1637	struct flock64 *bfp,
1638	int32_t flag,
1639	offset_t offset,
1640	cred_t *cr,
1641	caller_context_t *ct)
1642{
1643	int32_t error = 0;
1644
1645	ud_printf("udf_space\n");
1646
1647	if (cmd != F_FREESP) {
1648		error =  EINVAL;
1649	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1650		error = ud_freesp(vp, bfp, flag, cr);
1651
1652		if (error == 0 && bfp->l_start == 0)
1653			vnevent_truncate(vp, ct);
1654	}
1655
1656	return (error);
1657}
1658
1659/* ARGSUSED */
1660static int32_t
1661udf_getpage(
1662	struct vnode *vp,
1663	offset_t off,
1664	size_t len,
1665	uint32_t *protp,
1666	struct page **plarr,
1667	size_t plsz,
1668	struct seg *seg,
1669	caddr_t addr,
1670	enum seg_rw rw,
1671	struct cred *cr,
1672	caller_context_t *ct)
1673{
1674	struct ud_inode *ip = VTOI(vp);
1675	int32_t error, has_holes, beyond_eof, seqmode, dolock;
1676	int32_t pgsize = PAGESIZE;
1677	struct udf_vfs *udf_vfsp = ip->i_udf;
1678	page_t **pl;
1679	u_offset_t pgoff, eoff, uoff;
1680	krw_t rwtype;
1681	caddr_t pgaddr;
1682
1683	ud_printf("udf_getpage\n");
1684
1685	uoff = (u_offset_t)off; /* type conversion */
1686	if (protp) {
1687		*protp = PROT_ALL;
1688	}
1689	if (vp->v_flag & VNOMAP) {
1690		return (ENOSYS);
1691	}
1692	seqmode = ip->i_nextr == uoff && rw != S_CREATE;
1693
1694	rwtype = RW_READER;
1695	dolock = (rw_owner(&ip->i_contents) != curthread);
1696retrylock:
1697#ifdef	__lock_lint
1698	rw_enter(&ip->i_contents, rwtype);
1699#else
1700	if (dolock) {
1701		rw_enter(&ip->i_contents, rwtype);
1702	}
1703#endif
1704
1705	/*
1706	 * We may be getting called as a side effect of a bmap using
1707	 * fbread() when the blocks might be being allocated and the
1708	 * size has not yet been up'ed.  In this case we want to be
1709	 * able to return zero pages if we get back UDF_HOLE from
1710	 * calling bmap for a non write case here.  We also might have
1711	 * to read some frags from the disk into a page if we are
1712	 * extending the number of frags for a given lbn in bmap().
1713	 */
1714	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
1715	if (beyond_eof && seg != segkmap) {
1716#ifdef	__lock_lint
1717		rw_exit(&ip->i_contents);
1718#else
1719		if (dolock) {
1720			rw_exit(&ip->i_contents);
1721		}
1722#endif
1723		return (EFAULT);
1724	}
1725
1726	/*
1727	 * Must hold i_contents lock throughout the call to pvn_getpages
1728	 * since locked pages are returned from each call to ud_getapage.
1729	 * Must *not* return locked pages and then try for contents lock
1730	 * due to lock ordering requirements (inode > page)
1731	 */
1732
1733	has_holes = ud_bmap_has_holes(ip);
1734
1735	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
1736		int32_t	blk_size, count;
1737		u_offset_t offset;
1738
1739		/*
1740		 * We must acquire the RW_WRITER lock in order to
1741		 * call bmap_write().
1742		 */
1743		if (dolock && rwtype == RW_READER) {
1744			rwtype = RW_WRITER;
1745
1746			if (!rw_tryupgrade(&ip->i_contents)) {
1747
1748				rw_exit(&ip->i_contents);
1749
1750				goto retrylock;
1751			}
1752		}
1753
1754		/*
1755		 * May be allocating disk blocks for holes here as
1756		 * a result of mmap faults. write(2) does the bmap_write
1757		 * in rdip/wrip, not here. We are not dealing with frags
1758		 * in this case.
1759		 */
1760		offset = uoff;
1761		while ((offset < uoff + len) &&
1762		    (offset < ip->i_size)) {
			/*
			 * the variable "bnp" is to simplify the expression
			 * for the compiler; just passing in &bn to bmap_write
			 * causes a compiler "loop"
			 */
1768
1769			blk_size = udf_vfsp->udf_lbsize;
1770			if ((offset + blk_size) > ip->i_size) {
1771				count = ip->i_size - offset;
1772			} else {
1773				count = blk_size;
1774			}
1775			error = ud_bmap_write(ip, offset, count, 0, cr);
1776			if (error) {
1777				goto update_inode;
1778			}
1779			offset += count; /* XXX - make this contig */
1780		}
1781	}
1782
1783	/*
1784	 * Can be a reader from now on.
1785	 */
1786#ifdef	__lock_lint
1787	if (rwtype == RW_WRITER) {
1788		rw_downgrade(&ip->i_contents);
1789	}
1790#else
1791	if (dolock && rwtype == RW_WRITER) {
1792		rw_downgrade(&ip->i_contents);
1793	}
1794#endif
1795
	/*
	 * We remove PROT_WRITE in cases when the file has UDF holes
	 * because we don't want to call bmap_read() to check whether
	 * each page is backed by a disk block.
	 */
1801	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
1802		*protp &= ~PROT_WRITE;
1803	}
1804
1805	error = 0;
1806
1807	/*
1808	 * The loop looks up pages in the range <off, off + len).
1809	 * For each page, we first check if we should initiate an asynchronous
1810	 * read ahead before we call page_lookup (we may sleep in page_lookup
1811	 * for a previously initiated disk read).
1812	 */
1813	eoff = (uoff + len);
1814	for (pgoff = uoff, pgaddr = addr, pl = plarr;
1815	    pgoff < eoff; /* empty */) {
1816		page_t	*pp;
1817		u_offset_t	nextrio;
1818		se_t	se;
1819
1820		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
1821
1822		/*
1823		 * Handle async getpage (faultahead)
1824		 */
1825		if (plarr == NULL) {
1826			ip->i_nextrio = pgoff;
1827			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1828			pgoff += pgsize;
1829			pgaddr += pgsize;
1830			continue;
1831		}
1832
1833		/*
1834		 * Check if we should initiate read ahead of next cluster.
1835		 * We call page_exists only when we need to confirm that
1836		 * we have the current page before we initiate the read ahead.
1837		 */
1838		nextrio = ip->i_nextrio;
1839		if (seqmode &&
1840		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
1841		    nextrio < ip->i_size && page_exists(vp, pgoff))
1842			ud_getpage_ra(vp, pgoff, seg, pgaddr);
1843
1844		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
1845
1846			/*
1847			 * We found the page in the page cache.
1848			 */
1849			*pl++ = pp;
1850			pgoff += pgsize;
1851			pgaddr += pgsize;
1852			len -= pgsize;
1853			plsz -= pgsize;
1854		} else  {
1855
1856			/*
1857			 * We have to create the page, or read it from disk.
1858			 */
1859			if (error = ud_getpage_miss(vp, pgoff, len,
1860			    seg, pgaddr, pl, plsz, rw, seqmode)) {
1861				goto error_out;
1862			}
1863
1864			while (*pl != NULL) {
1865				pl++;
1866				pgoff += pgsize;
1867				pgaddr += pgsize;
1868				len -= pgsize;
1869				plsz -= pgsize;
1870			}
1871		}
1872	}
1873
1874	/*
1875	 * Return pages up to plsz if they are in the page cache.
1876	 * We cannot return pages if there is a chance that they are
1877	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1878	 */
1879	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
1880
1881		ASSERT((protp == NULL) ||
1882		    !(has_holes && (*protp & PROT_WRITE)));
1883
1884		eoff = pgoff + plsz;
1885		while (pgoff < eoff) {
1886			page_t		*pp;
1887
1888			if ((pp = page_lookup_nowait(vp, pgoff,
1889			    SE_SHARED)) == NULL)
1890				break;
1891
1892			*pl++ = pp;
1893			pgoff += pgsize;
1894			plsz -= pgsize;
1895		}
1896	}
1897
1898	if (plarr)
1899		*pl = NULL;			/* Terminate page list */
1900	ip->i_nextr = pgoff;
1901
1902error_out:
1903	if (error && plarr) {
1904		/*
1905		 * Release any pages we have locked.
1906		 */
1907		while (pl > &plarr[0])
1908			page_unlock(*--pl);
1909
1910		plarr[0] = NULL;
1911	}
1912
1913update_inode:
1914#ifdef	__lock_lint
1915	rw_exit(&ip->i_contents);
1916#else
1917	if (dolock) {
1918		rw_exit(&ip->i_contents);
1919	}
1920#endif
1921
1922	/*
1923	 * If the inode is not already marked for IACC (in rwip() for read)
1924	 * and the inode is not marked for no access time update (in rwip()
1925	 * for write) then update the inode access time and mod time now.
1926	 */
1927	mutex_enter(&ip->i_tlock);
1928	if ((ip->i_flag & (IACC | INOACC)) == 0) {
1929		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
1930			ip->i_flag |= IACC;
1931		}
1932		if (rw == S_WRITE) {
1933			ip->i_flag |= IUPD;
1934		}
1935		ITIMES_NOLOCK(ip);
1936	}
1937	mutex_exit(&ip->i_tlock);
1938
1939	return (error);
1940}
1941
1942int32_t ud_delay = 1;
1943
1944/* ARGSUSED */
1945static int32_t
1946udf_putpage(
1947	struct vnode *vp,
1948	offset_t off,
1949	size_t len,
1950	int32_t flags,
1951	struct cred *cr,
1952	caller_context_t *ct)
1953{
1954	struct ud_inode *ip;
1955	int32_t error = 0;
1956
1957	ud_printf("udf_putpage\n");
1958
1959	ip = VTOI(vp);
1960#ifdef	__lock_lint
1961	rw_enter(&ip->i_contents, RW_WRITER);
1962#endif
1963
1964	if (vp->v_count == 0) {
1965		cmn_err(CE_WARN, "ud_putpage : bad v_count");
1966		error = EINVAL;
1967		goto out;
1968	}
1969
1970	if (vp->v_flag & VNOMAP) {
1971		error = ENOSYS;
1972		goto out;
1973	}
1974
1975	if (flags & B_ASYNC) {
1976		if (ud_delay && len &&
1977		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1978			mutex_enter(&ip->i_tlock);
1979
1980			/*
1981			 * If nobody stalled, start a new cluster.
1982			 */
1983			if (ip->i_delaylen == 0) {
1984				ip->i_delayoff = off;
1985				ip->i_delaylen = len;
1986				mutex_exit(&ip->i_tlock);
1987				goto out;
1988			}
1989
1990			/*
1991			 * If we have a full cluster or they are not contig,
1992			 * then push last cluster and start over.
1993			 */
1994			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1995			    ip->i_delayoff + ip->i_delaylen != off) {
1996				u_offset_t doff;
1997				size_t dlen;
1998
1999				doff = ip->i_delayoff;
2000				dlen = ip->i_delaylen;
2001				ip->i_delayoff = off;
2002				ip->i_delaylen = len;
2003				mutex_exit(&ip->i_tlock);
2004				error = ud_putpages(vp, doff, dlen, flags, cr);
2005				/* LMXXX - flags are new val, not old */
2006				goto out;
2007			}
2008
2009			/*
2010			 * There is something there, it's not full, and
2011			 * it is contig.
2012			 */
2013			ip->i_delaylen += len;
2014			mutex_exit(&ip->i_tlock);
2015			goto out;
2016		}
2017
2018		/*
2019		 * Must have weird flags or we are not clustering.
2020		 */
2021	}
2022
2023	error = ud_putpages(vp, off, len, flags, cr);
2024
2025out:
2026#ifdef	__lock_lint
2027	rw_exit(&ip->i_contents);
2028#endif
2029	return (error);
2030}
2031
2032/* ARGSUSED */
2033static int32_t
2034udf_map(
2035	struct vnode *vp,
2036	offset_t off,
2037	struct as *as,
2038	caddr_t *addrp,
2039	size_t len,
2040	uint8_t prot,
2041	uint8_t maxprot,
2042	uint32_t flags,
2043	struct cred *cr,
2044	caller_context_t *ct)
2045{
2046	struct segvn_crargs vn_a;
2047	int32_t error = 0;
2048
2049	ud_printf("udf_map\n");
2050
2051	if (vp->v_flag & VNOMAP) {
2052		error = ENOSYS;
2053		goto end;
2054	}
2055
2056	if ((off < (offset_t)0) ||
2057	    ((off + len) < (offset_t)0)) {
2058		error = EINVAL;
2059		goto end;
2060	}
2061
2062	if (vp->v_type != VREG) {
2063		error = ENODEV;
2064		goto end;
2065	}
2066
2067	/*
2068	 * If file is being locked, disallow mapping.
2069	 */
2070	if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2071		error = EAGAIN;
2072		goto end;
2073	}
2074
2075	as_rangelock(as);
2076	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2077	if (error != 0) {
2078		as_rangeunlock(as);
2079		goto end;
2080	}
2081
2082	vn_a.vp = vp;
2083	vn_a.offset = off;
2084	vn_a.type = flags & MAP_TYPE;
2085	vn_a.prot = prot;
2086	vn_a.maxprot = maxprot;
2087	vn_a.cred = cr;
2088	vn_a.amp = NULL;
2089	vn_a.flags = flags & ~MAP_TYPE;
2090	vn_a.szc = 0;
2091	vn_a.lgrp_mem_policy_flags = 0;
2092
2093	error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2094	as_rangeunlock(as);
2095
2096end:
2097	return (error);
2098}
2099
2100/* ARGSUSED */
2101static int32_t
2102udf_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
2103    size_t len, uint8_t prot, uint8_t maxprot, uint32_t flags,
2104    struct cred *cr, caller_context_t *ct)
2105{
2106	struct ud_inode *ip = VTOI(vp);
2107
2108	ud_printf("udf_addmap\n");
2109
2110	if (vp->v_flag & VNOMAP) {
2111		return (ENOSYS);
2112	}
2113
2114	mutex_enter(&ip->i_tlock);
2115	ip->i_mapcnt += btopr(len);
2116	mutex_exit(&ip->i_tlock);
2117
2118	return (0);
2119}
2120
2121/* ARGSUSED */
2122static int32_t
2123udf_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
2124    size_t len, uint32_t prot, uint32_t maxprot, uint32_t flags,
2125    struct cred *cr, caller_context_t *ct)
2126{
2127	struct ud_inode *ip = VTOI(vp);
2128
2129	ud_printf("udf_delmap\n");
2130
2131	if (vp->v_flag & VNOMAP) {
2132		return (ENOSYS);
2133	}
2134
2135	mutex_enter(&ip->i_tlock);
2136	ip->i_mapcnt -= btopr(len);	/* Count released mappings */
2137	ASSERT(ip->i_mapcnt >= 0);
2138	mutex_exit(&ip->i_tlock);
2139
2140	return (0);
2141}
2142
2143/* ARGSUSED */
2144static int32_t
2145udf_l_pathconf(struct vnode *vp, int32_t cmd, ulong_t *valp, struct cred *cr,
2146    caller_context_t *ct)
2147{
2148	int32_t error = 0;
2149
2150	ud_printf("udf_l_pathconf\n");
2151
2152	if (cmd == _PC_FILESIZEBITS) {
		/*
		 * udf supports a 64-bit file size, but there are several
		 * other restrictions: it only supports 32-bit block
		 * numbers and daddr32_t is only an int32_t, so taking
		 * these into account we can stay at the same limit as ufs.
		 */
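		/*
		 * Rough arithmetic behind the value below: a signed 32-bit
		 * block number times a DEV_BSIZE (512-byte) unit gives
		 * 2^31 * 2^9 = 2^40 bytes, which takes 41 bits to express
		 * as a signed quantity, the same limit ufs advertises.
		 */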
2160		*valp = 41;
2161	} else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2162		/* nanosecond timestamp resolution */
2163		*valp = 1L;
2164	} else {
2165		error = fs_pathconf(vp, cmd, valp, cr, ct);
2166	}
2167
2168	return (error);
2169}
2170
2171uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2172#ifndef	__lint
2173_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2174_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2175#endif
2176/*
2177 * Assumption is that there will not be a pageio request
 * to an embedded file
2179 */
2180/* ARGSUSED */
2181static int32_t
2182udf_pageio(
2183	struct vnode *vp,
2184	struct page *pp,
2185	u_offset_t io_off,
2186	size_t io_len,
2187	int32_t flags,
2188	struct cred *cr,
2189	caller_context_t *ct)
2190{
2191	daddr_t bn;
2192	struct buf *bp;
2193	struct ud_inode *ip = VTOI(vp);
2194	int32_t dolock, error = 0, contig, multi_io;
2195	size_t done_len = 0, cur_len = 0;
2196	page_t *npp = NULL, *opp = NULL, *cpp = pp;
2197
2198	if (pp == NULL) {
2199		return (EINVAL);
2200	}
2201
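	/*
	 * Acquire i_contents below only if the caller does not
	 * already hold it.
	 */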
2202	dolock = (rw_owner(&ip->i_contents) != curthread);
2203
2204	/*
2205	 * We need a better check.  Ideally, we would use another
2206	 * vnodeops so that hlocked and forcibly unmounted file
2207	 * systems would return EIO where appropriate and w/o the
2208	 * need for these checks.
2209	 */
2210	if (ip->i_udf == NULL) {
2211		return (EIO);
2212	}
2213
2214#ifdef	__lock_lint
2215	rw_enter(&ip->i_contents, RW_READER);
2216#else
2217	if (dolock) {
2218		rw_enter(&ip->i_contents, RW_READER);
2219	}
2220#endif
2221
2222	/*
2223	 * Break the io request into chunks, one for each contiguous
2224	 * stretch of disk blocks in the target file.
2225	 */
2226	while (done_len < io_len) {
2227		ASSERT(cpp);
2228		bp = NULL;
2229		contig = multi_io = 0;
2230		if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2231		    &bn, &contig)) {
2232			break;
2233		}
2234
2235		if (bn == UDF_HOLE) {   /* No holey swapfiles */
2236			cmn_err(CE_WARN, "SWAP file has HOLES");
2237			error = EINVAL;
2238			break;
2239		}
2240
2241		cur_len = MIN(io_len - done_len, contig);
2242
2243		/*
2244		 * When the logical block size is smaller than PAGESIZE,
2245		 * a page may span non-contiguous extents; check whether
2246		 * more than one I/O is required to complete this chunk.
2247		 */
2248		if (ip->i_udf->udf_lbsize < PAGESIZE) {
2249			if (cur_len >= PAGESIZE) {
2250				multi_io = 0;
2251				cur_len &= PAGEMASK;
2252			} else {
2253				multi_io = 1;
2254				cur_len = MIN(io_len - done_len, PAGESIZE);
2255			}
2256		}
2257		page_list_break(&cpp, &npp, btop(cur_len));
2258
2259		bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2260		ASSERT(bp != NULL);
2261
2262		bp->b_edev = ip->i_dev;
2263		bp->b_dev = cmpdev(ip->i_dev);
2264		bp->b_blkno = bn;
2265		bp->b_un.b_addr = (caddr_t)0;
2266		bp->b_file = vp;
2267		bp->b_offset = (offset_t)(io_off + done_len);
2268
2269/*
2270 *		ub.ub_pageios.value.ul++;
2271 */
2272		if (multi_io == 0) {
2273			(void) bdev_strategy(bp);
2274		} else {
2275			error = ud_multi_strat(ip, cpp, bp,
2276			    (u_offset_t)(io_off + done_len));
2277			if (error != 0) {
2278				pageio_done(bp);
2279				break;
2280			}
2281		}
2282		if (flags & B_READ) {
2283			ud_pageio_reads++;
2284		} else {
2285			ud_pageio_writes++;
2286		}
2287
2288		/*
2289		 * If the request is not B_ASYNC, wait for i/o to complete
2290		 * and re-assemble the page list to return to the caller.
2291		 * If it is B_ASYNC we leave the page list in pieces and
2292		 * cleanup() will dispose of them.
2293		 */
2294		if ((flags & B_ASYNC) == 0) {
2295			error = biowait(bp);
2296			pageio_done(bp);
2297			if (error) {
2298				break;
2299			}
2300			page_list_concat(&opp, &cpp);
2301		}
2302		cpp = npp;
2303		npp = NULL;
2304		done_len += cur_len;
2305	}
2306
2307	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2308	if (error) {
2309		if (flags & B_ASYNC) {
2310			/* Cleanup unprocessed parts of list */
2311			page_list_concat(&cpp, &npp);
2312			if (flags & B_READ) {
2313				pvn_read_done(cpp, B_ERROR);
2314			} else {
2315				pvn_write_done(cpp, B_ERROR);
2316			}
2317		} else {
2318			/* Re-assemble list and let caller clean up */
2319			page_list_concat(&opp, &cpp);
2320			page_list_concat(&opp, &npp);
2321		}
2322	}
2323
2324#ifdef	__lock_lint
2325	rw_exit(&ip->i_contents);
2326#else
2327	if (dolock) {
2328		rw_exit(&ip->i_contents);
2329	}
2330#endif
2331	return (error);
2332}
2333
2334
2335
2336
2337/* -------------------- local functions --------------------------- */
2338
2339
2340
2341int32_t
2342ud_rdwri(enum uio_rw rw, int32_t ioflag, struct ud_inode *ip, caddr_t base,
2343    int32_t len, offset_t offset, enum uio_seg seg, int32_t *aresid,
2344    struct cred *cr)
2345{
2346	int32_t error;
2347	struct uio auio;
2348	struct iovec aiov;
2349
2350	ud_printf("ud_rdwri\n");
2351
2352	bzero((caddr_t)&auio, sizeof (uio_t));
2353	bzero((caddr_t)&aiov, sizeof (iovec_t));
2354
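	/*
	 * Build a single-iovec uio describing the caller's buffer
	 * and hand it to the common read/write path below.
	 */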
2355	aiov.iov_base = base;
2356	aiov.iov_len = len;
2357	auio.uio_iov = &aiov;
2358	auio.uio_iovcnt = 1;
2359	auio.uio_loffset = offset;
2360	auio.uio_segflg = (int16_t)seg;
2361	auio.uio_resid = len;
2362
2363	if (rw == UIO_WRITE) {
2364		auio.uio_fmode = FWRITE;
2365		auio.uio_extflg = UIO_COPY_DEFAULT;
2366		auio.uio_llimit = curproc->p_fsz_ctl;
2367		error = ud_wrip(ip, &auio, ioflag, cr);
2368	} else {
2369		auio.uio_fmode = FREAD;
2370		auio.uio_extflg = UIO_COPY_CACHED;
2371		auio.uio_llimit = MAXOFFSET_T;
2372		error = ud_rdip(ip, &auio, ioflag, cr);
2373	}
2374
2375	if (aresid) {
2376		*aresid = auio.uio_resid;
2377	} else if (auio.uio_resid) {
2378		error = EIO;
2379	}
2380	return (error);
2381}
2382
2383/*
2384 * Free behind hacks.  The pager is busted.
2385 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2386 * or B_FREE_IF_TIGHT_ON_MEMORY.
2387 */
2388int32_t ud_freebehind = 1;
2389int32_t ud_smallfile = 32 * 1024;
2390
2391/* ARGSUSED */
2392int32_t
2393ud_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
2394    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int32_t seq)
2395{
2396	struct ud_inode *ip = VTOI(vp);
2397	int32_t err = 0;
2398	size_t io_len;
2399	u_offset_t io_off;
2400	u_offset_t pgoff;
2401	page_t *pp;
2402
2403	pl[0] = NULL;
2404
2405	/*
2406	 * Figure out whether the page can be created, or must be
2407	 * read from the disk
2408	 */
2409	if (rw == S_CREATE) {
2410		if ((pp = page_create_va(vp, off,
2411		    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2412			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2413			return (EINVAL);
2414		}
2415		io_len = PAGESIZE;
2416	} else {
2417		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2418		    &io_len, off, PAGESIZE, 0);
2419
2420		/*
2421		 * Some other thread has entered the page.
2422		 * ud_getpage will retry page_lookup.
2423		 */
2424		if (pp == NULL) {
2425			return (0);
2426		}
2427
2428		/*
2429		 * Fill the page with as much data as we can from the file.
2430		 */
2431		err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2432		if (err) {
2433			pvn_read_done(pp, B_ERROR);
2434			return (err);
2435		}
2436
2437		/*
2438		 * XXX ??? ufs has io_len instead of pgoff below
2439		 */
2440		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2441
2442		/*
2443		 * If the file access is sequential, initiate read ahead
2444		 * of the next cluster.
2445		 */
2446		if (seq && ip->i_nextrio < ip->i_size) {
2447			ud_getpage_ra(vp, off, seg, addr);
2448		}
2449	}
2450
2451outmiss:
2452	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2453	return (err);
2454}
2455
2456/* ARGSUSED */
2457void
2458ud_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
2459{
2460	page_t *pp;
2461	size_t io_len;
2462	struct ud_inode *ip = VTOI(vp);
2463	u_offset_t io_off = ip->i_nextrio, pgoff;
2464	caddr_t addr2 = addr + (io_off - off);
2465	daddr_t bn;
2466	int32_t contig = 0;
2467
2468	/*
2469	 * Is this test needed?
2470	 */
2471
2472	if (addr2 >= seg->s_base + seg->s_size) {
2473		return;
2474	}
2475
2476	contig = 0;
2477	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2478		return;
2479	}
2480
2481	pp = pvn_read_kluster(vp, io_off, seg, addr2,
2482	    &io_off, &io_len, io_off, PAGESIZE, 1);
2483
2484	/*
2485	 * Some other thread has entered the page.
2486	 * So no read ahead is done here (i.e. we will have to
2487	 * wait for the read when the page is actually needed).
2488	 */
2489
2490	if (pp == NULL) {
2491		return;
2492	}
2493
2494	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2495	ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2496}
2497
2498int
2499ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off, uint32_t bflgs,
2500    u_offset_t *pg_off)
2501{
2502	daddr_t bn;
2503	struct buf *bp;
2504	caddr_t kaddr, caddr;
2505	int32_t error = 0, contig = 0, multi_io = 0;
2506	int32_t lbsize = ip->i_udf->udf_lbsize;
2507	int32_t lbmask = ip->i_udf->udf_lbmask;
2508	uint64_t isize;
2509
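	/* File size rounded up to a logical block boundary. */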
2510	isize = (ip->i_size + lbmask) & (~lbmask);
2511	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2512
2513		/*
2514		 * Embedded file:  read the file_entry from the
2515		 * buffer cache and copy the required portion
2516		 * into the page.
2517		 */
2518		bp = ud_bread(ip->i_dev,
2519		    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2520		if ((bp->b_error == 0) &&
2521		    (bp->b_resid == 0)) {
2522
2523			caddr = bp->b_un.b_addr + ip->i_data_off;
2524
2525			/*
2526			 * mapin to kvm
2527			 */
2528			kaddr = (caddr_t)ppmapin(pp,
2529			    PROT_READ | PROT_WRITE, (caddr_t)-1);
2530			(void) kcopy(caddr, kaddr, ip->i_size);
2531
2532			/*
2533			 * mapout of kvm
2534			 */
2535			ppmapout(kaddr);
2536		}
2537		brelse(bp);
2538		contig = ip->i_size;
2539	} else {
2540
2541		/*
2542		 * Get the contiguous size and block number
2543		 * at offset "off".
2544		 */
2545		if (error = ud_bmap_read(ip, off, &bn, &contig))
2546			goto out;
2547		contig = MIN(contig, PAGESIZE);
2548		contig = (contig + lbmask) & (~lbmask);
2549
2550		/*
2551		 * Zero part of the page which we are not
2552		 * going to read from the disk.
2553		 */
2554
2555		if (bn == UDF_HOLE) {
2556
2557			/*
2558			 * This is a HOLE. Just zero out
2559			 * the page
2560			 */
2561			if (((off + contig) == isize) ||
2562			    (contig == PAGESIZE)) {
2563				pagezero(pp->p_prev, 0, PAGESIZE);
2564				goto out;
2565			}
2566		}
2567
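		/*
		 * If the contiguous extent is shorter than a page and
		 * reaches the (block-rounded) end of file, just zero
		 * the tail of the page; if it is shorter but does not
		 * reach end of file, the page spans more than one
		 * extent and needs multiple I/Os.
		 */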
2568		if (contig < PAGESIZE) {
2569			uint64_t count;
2570
2571			count = isize - off;
2572			if (contig != count) {
2573				multi_io = 1;
2574				contig = (int32_t)(MIN(count, PAGESIZE));
2575			} else {
2576				pagezero(pp->p_prev, contig, PAGESIZE - contig);
2577			}
2578		}
2579
2580		/*
2581		 * Get a bp and initialize it
2582		 */
2583		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2584		ASSERT(bp != NULL);
2585
2586		bp->b_edev = ip->i_dev;
2587		bp->b_dev = cmpdev(ip->i_dev);
2588		bp->b_blkno = bn;
2589		bp->b_un.b_addr = 0;
2590		bp->b_file = ip->i_vnode;
2591
2592		/*
2593		 * Start I/O
2594		 */
2595		if (multi_io == 0) {
2596
2597			/*
2598			 * Single I/O is sufficient for this page
2599			 */
2600			(void) bdev_strategy(bp);
2601		} else {
2602
2603			/*
2604			 * We need to do the I/O in
2605			 * pieces.
2606			 */
2607			error = ud_multi_strat(ip, pp, bp, off);
2608			if (error != 0) {
2609				goto out;
2610			}
2611		}
2612		if ((bflgs & B_ASYNC) == 0) {
2613
2614			/*
2615			 * Wait for i/o to complete.
2616			 */
2617
2618			error = biowait(bp);
2619			pageio_done(bp);
2620			if (error) {
2621				goto out;
2622			}
2623		}
2624	}
2625	if ((off + contig) >= ip->i_size) {
2626		contig = ip->i_size - off;
2627	}
2628
2629out:
2630	*pg_off = contig;
2631	return (error);
2632}
2633
2634int32_t
2635ud_putpages(struct vnode *vp, offset_t off, size_t len, int32_t flags,
2636    struct cred *cr)
2637{
2638	struct ud_inode *ip;
2639	page_t *pp;
2640	u_offset_t io_off;
2641	size_t io_len;
2642	u_offset_t eoff;
2643	int32_t err = 0;
2644	int32_t dolock;
2645
2646	ud_printf("ud_putpages\n");
2647
2648	if (vp->v_count == 0) {
2649		cmn_err(CE_WARN, "ud_putpages: bad v_count");
2650		return (EINVAL);
2651	}
2652
2653	ip = VTOI(vp);
2654
2655	/*
2656	 * Acquire the readers/write inode lock before locking
2657	 * any pages in this inode.
2658	 * The inode lock is held during i/o.
2659	 */
2660	if (len == 0) {
2661		mutex_enter(&ip->i_tlock);
2662		ip->i_delayoff = ip->i_delaylen = 0;
2663		mutex_exit(&ip->i_tlock);
2664	}
2665#ifdef	__lock_lint
2666	rw_enter(&ip->i_contents, RW_READER);
2667#else
2668	dolock = (rw_owner(&ip->i_contents) != curthread);
2669	if (dolock) {
2670		rw_enter(&ip->i_contents, RW_READER);
2671	}
2672#endif
2673
2674	if (!vn_has_cached_data(vp)) {
2675#ifdef	__lock_lint
2676		rw_exit(&ip->i_contents);
2677#else
2678		if (dolock) {
2679			rw_exit(&ip->i_contents);
2680		}
2681#endif
2682		return (0);
2683	}
2684
2685	if (len == 0) {
2686		/*
2687		 * Search the entire vp list for pages >= off.
2688		 */
2689		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2690		    flags, cr);
2691	} else {
2692		/*
2693		 * Loop over all offsets in the range looking for
2694		 * pages to deal with.
2695		 */
2696		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2697			eoff = MIN(off + len, eoff);
2698		} else {
2699			eoff = off + len;
2700		}
2701
2702		for (io_off = off; io_off < eoff; io_off += io_len) {
2703			/*
2704			 * If we are not invalidating, synchronously
2705			 * freeing or writing pages, use the routine
2706			 * page_lookup_nowait() to prevent reclaiming
2707			 * them from the free list.
2708			 */
2709			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2710				pp = page_lookup(vp, io_off,
2711				    (flags & (B_INVAL | B_FREE)) ?
2712				    SE_EXCL : SE_SHARED);
2713			} else {
2714				pp = page_lookup_nowait(vp, io_off,
2715				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2716			}
2717
2718			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2719				io_len = PAGESIZE;
2720			} else {
2721
2722				err = ud_putapage(vp, pp,
2723				    &io_off, &io_len, flags, cr);
2724				if (err != 0) {
2725					break;
2726				}
2727				/*
2728				 * "io_off" and "io_len" are returned as
2729				 * the range of pages we actually wrote.
2730				 * This allows us to skip ahead more quickly
2731				 * since several pages may've been dealt
2732				 * with by this iteration of the loop.
2733				 */
2734			}
2735		}
2736	}
2737	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2738		/*
2739		 * We have just sync'ed back all the pages on
2740		 * the inode, turn off the IMODTIME flag.
2741		 */
2742		mutex_enter(&ip->i_tlock);
2743		ip->i_flag &= ~IMODTIME;
2744		mutex_exit(&ip->i_tlock);
2745	}
2746#ifdef	__lock_lint
2747	rw_exit(&ip->i_contents);
2748#else
2749	if (dolock) {
2750		rw_exit(&ip->i_contents);
2751	}
2752#endif
2753	return (err);
2754}
2755
2756/* ARGSUSED */
2757int32_t
2758ud_putapage(struct vnode *vp, page_t *pp, u_offset_t *offp,
2759    size_t *lenp, int32_t flags, struct cred *cr)
2760{
2761	daddr_t bn;
2762	size_t io_len;
2763	struct ud_inode *ip;
2764	int32_t error = 0, contig, multi_io = 0;
2765	struct udf_vfs *udf_vfsp;
2766	u_offset_t off, io_off;
2767	caddr_t kaddr, caddr;
2768	struct buf *bp = NULL;
2769	int32_t lbmask;
2770	uint64_t isize;
2771	uint16_t crc_len;
2772	struct file_entry *fe;
2773
2774	ud_printf("ud_putapage\n");
2775
2776	ip = VTOI(vp);
2777	ASSERT(ip);
2778	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2779	lbmask = ip->i_udf->udf_lbmask;
2780	isize = (ip->i_size + lbmask) & (~lbmask);
2781
2782	udf_vfsp = ip->i_udf;
2783	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2784
2785	/*
2786	 * If the modified time on the inode has not already been
2787	 * set elsewhere (e.g. for write/setattr) we set the time now.
2788	 * This gives us approximate modified times for mmap'ed files
2789	 * which are modified via stores in the user address space.
2790	 */
2791	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2792		mutex_enter(&ip->i_tlock);
2793		ip->i_flag |= IUPD;
2794		ITIMES_NOLOCK(ip);
2795		mutex_exit(&ip->i_tlock);
2796	}
2797
2798
2799	/*
2800	 * Align the request to a block boundary (for old file systems),
2801	 * and go ask bmap() how contiguous things are for this file.
2802	 */
2803	off = pp->p_offset & ~(offset_t)lbmask;
2804				/* block align it */
2805
2806
2807	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2808		ASSERT(ip->i_size <= ip->i_max_emb);
2809
2810		pp = pvn_write_kluster(vp, pp, &io_off,
2811		    &io_len, off, PAGESIZE, flags);
2812		if (io_len == 0) {
2813			io_len = PAGESIZE;
2814		}
2815
2816		bp = ud_bread(ip->i_dev,
2817		    ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2818		    udf_vfsp->udf_lbsize);
2819		fe = (struct file_entry *)bp->b_un.b_addr;
2820		if ((bp->b_flags & B_ERROR) ||
2821		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2822		    ip->i_icb_block,
2823		    1, udf_vfsp->udf_lbsize) != 0)) {
2824			if (pp != NULL)
2825				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2826			if (bp->b_flags & B_ERROR) {
2827				error = EIO;
2828			} else {
2829				error = EINVAL;
2830			}
2831			brelse(bp);
2832			return (error);
2833		}
2834		if ((bp->b_error == 0) &&
2835		    (bp->b_resid == 0)) {
2836
2837			caddr = bp->b_un.b_addr + ip->i_data_off;
2838			kaddr = (caddr_t)ppmapin(pp,
2839			    PROT_READ | PROT_WRITE, (caddr_t)-1);
2840			(void) kcopy(kaddr, caddr, ip->i_size);
2841			ppmapout(kaddr);
2842		}
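		/*
		 * Rebuild the descriptor tag for the updated file_entry;
		 * crc_len spans the header up to fe_spec, the extended
		 * attributes, and the embedded data just copied in.
		 */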
2843		crc_len = offsetof(struct file_entry, fe_spec) +
2844		    SWAP_32(fe->fe_len_ear);
2845		crc_len += ip->i_size;
2846		ud_make_tag(ip->i_udf, &fe->fe_tag,
2847		    UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2848
2849		bwrite(bp);
2850
2851		if (flags & B_ASYNC) {
2852			pvn_write_done(pp, flags);
2853		}
2854		contig = ip->i_size;
2855	} else {
2856
2857		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2858			goto out;
2859		}
2860		contig = MIN(contig, PAGESIZE);
2861		contig = (contig + lbmask) & (~lbmask);
2862
2863		if (contig < PAGESIZE) {
2864			uint64_t count;
2865
2866			count = isize - off;
2867			if (contig != count) {
2868				multi_io = 1;
2869				contig = (int32_t)(MIN(count, PAGESIZE));
2870			}
2871		}
2872
2873		if ((off + contig) > isize) {
2874			contig = isize - off;
2875		}
2876
2877		if (contig > PAGESIZE) {
2878			if (contig & PAGEOFFSET) {
2879				contig &= PAGEMASK;
2880			}
2881		}
2882
2883		pp = pvn_write_kluster(vp, pp, &io_off,
2884		    &io_len, off, contig, flags);
2885		if (io_len == 0) {
2886			io_len = PAGESIZE;
2887		}
2888
2889		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2890		ASSERT(bp != NULL);
2891
2892		bp->b_edev = ip->i_dev;
2893		bp->b_dev = cmpdev(ip->i_dev);
2894		bp->b_blkno = bn;
2895		bp->b_un.b_addr = 0;
2896		bp->b_file = vp;
2897		bp->b_offset = (offset_t)off;
2898
2899
2900		/*
2901		 * write throttle
2902		 */
2903		ASSERT(bp->b_iodone == NULL);
2904		bp->b_iodone = ud_iodone;
2905		mutex_enter(&ip->i_tlock);
2906		ip->i_writes += bp->b_bcount;
2907		mutex_exit(&ip->i_tlock);
2908
2909		if (multi_io == 0) {
2910
2911			(void) bdev_strategy(bp);
2912		} else {
2913			error = ud_multi_strat(ip, pp, bp, off);
2914			if (error != 0) {
2915				goto out;
2916			}
2917		}
2918
2919		if ((flags & B_ASYNC) == 0) {
2920			/*
2921			 * Wait for i/o to complete.
2922			 */
2923			error = biowait(bp);
2924			pageio_done(bp);
2925		}
2926	}
2927
2928	if ((flags & B_ASYNC) == 0) {
2929		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2930	}
2931
2932	pp = NULL;
2933
2934out:
2935	if (error != 0 && pp != NULL) {
2936		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2937	}
2938
2939	if (offp) {
2940		*offp = io_off;
2941	}
2942	if (lenp) {
2943		*lenp = io_len;
2944	}
2945
2946	return (error);
2947}
2948
2949
2950int32_t
2951ud_iodone(struct buf *bp)
2952{
2953	struct ud_inode *ip;
2954
2955	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2956
2957	bp->b_iodone = NULL;
2958
2959	ip = VTOI(bp->b_pages->p_vnode);
2960
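	/*
	 * Write throttling:  i_writes counts the bytes of writes in
	 * flight (see ud_putapage).  Once it drains back to the
	 * low-water mark ud_LW, wake any waiters on i_wrcv.
	 */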
2961	mutex_enter(&ip->i_tlock);
2962	if (ip->i_writes >= ud_LW) {
2963		if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2964			if (ud_WRITES) {
2965				cv_broadcast(&ip->i_wrcv); /* wake all up */
2966			}
2967		}
2968	} else {
2969		ip->i_writes -= bp->b_bcount;
2970	}
2971	mutex_exit(&ip->i_tlock);
2972	iodone(bp);
2973	return (0);
2974}
2975
2976/* ARGSUSED3 */
2977int32_t
2978ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
2979{
2980	struct vnode *vp;
2981	struct udf_vfs *udf_vfsp;
2982	krw_t rwtype;
2983	caddr_t base;
2984	uint32_t flags;
2985	int32_t error, n, on, mapon, dofree;
2986	u_offset_t off;
2987	long oresid = uio->uio_resid;
2988
2989	ASSERT(RW_LOCK_HELD(&ip->i_contents));
2990	if ((ip->i_type != VREG) &&
2991	    (ip->i_type != VDIR) &&
2992	    (ip->i_type != VLNK)) {
2993		return (EIO);
2994	}
2995
2996	if (uio->uio_loffset > MAXOFFSET_T) {
2997		return (0);
2998	}
2999
3000	if ((uio->uio_loffset < (offset_t)0) ||
3001	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
3002		return (EINVAL);
3003	}
3004	if (uio->uio_resid == 0) {
3005		return (0);
3006	}
3007
3008	vp = ITOV(ip);
3009	udf_vfsp = ip->i_udf;
3010	mutex_enter(&ip->i_tlock);
3011	ip->i_flag |= IACC;
3012	mutex_exit(&ip->i_tlock);
3013
3014	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
3015
3016	do {
3017		offset_t diff;
3018		u_offset_t uoff = uio->uio_loffset;
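		/*
		 * off/mapon locate the MAXBSIZE segmap window and the
		 * offset within it; on/n limit this pass to at most
		 * one file system block of data.
		 */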
3019		off = uoff & (offset_t)MAXBMASK;
3020		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3021		on = (int)blkoff(udf_vfsp, uoff);
3022		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3023
3024		diff = ip->i_size - uoff;
3025
3026		if (diff <= (offset_t)0) {
3027			error = 0;
3028			goto out;
3029		}
3030		if (diff < (offset_t)n) {
3031			n = (int)diff;
3032		}
3033		dofree = ud_freebehind &&
3034		    ip->i_nextr == (off & PAGEMASK) &&
3035		    off > ud_smallfile;
3036
3037#ifndef	__lock_lint
3038		if (rwtype == RW_READER) {
3039			rw_exit(&ip->i_contents);
3040		}
3041#endif
3042
3043		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3044		    (uint32_t)n, 1, S_READ);
3045		error = uiomove(base + mapon, (long)n, UIO_READ, uio);
3046
3047		flags = 0;
3048		if (!error) {
3049			/*
3050			 * If we read a whole block, or read to EOF,
3051			 * we won't need this buffer again soon.
3052			 */
3053			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
3054			    freemem < lotsfree + pages_before_pager) {
3055				flags = SM_FREE | SM_DONTNEED | SM_ASYNC;
3056			}
3057			/*
3058			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3059			 * we want to make sure that the page which has
3060			 * been read is written to disk if it is dirty,
3061			 * and that the corresponding indirect blocks are
3062			 * also flushed out.
3063			 */
3064			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3065				flags &= ~SM_ASYNC;
3066				flags |= SM_WRITE;
3067			}
3068			error = segmap_release(segkmap, base, flags);
3069		} else    {
3070			(void) segmap_release(segkmap, base, flags);
3071		}
3072
3073#ifndef __lock_lint
3074		if (rwtype == RW_READER) {
3075			rw_enter(&ip->i_contents, rwtype);
3076		}
3077#endif
3078	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3079out:
3080	/*
3081	 * Inode is updated according to this table if FRSYNC is set.
3082	 *
3083	 *	FSYNC	FDSYNC(posix.4)
3084	 *	--------------------------
3085	 *	always	IATTCHG|IBDWRITE
3086	 */
3087	if (ioflag & FRSYNC) {
3088		if ((ioflag & FSYNC) ||
3089		    ((ioflag & FDSYNC) &&
3090		    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3091			rw_exit(&ip->i_contents);
3092			rw_enter(&ip->i_contents, RW_WRITER);
3093			ud_iupdat(ip, 1);
3094		}
3095	}
3096	/*
3097	 * If we've already done a partial read, terminate
3098	 * the read but return no error.
3099	 */
3100	if (oresid != uio->uio_resid) {
3101		error = 0;
3102	}
3103	ITIMES(ip);
3104
3105	return (error);
3106}
3107
3108int32_t
3109ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
3110{
3111	caddr_t base;
3112	struct vnode *vp;
3113	struct udf_vfs *udf_vfsp;
3114	uint32_t flags;
3115	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
3116	int32_t pagecreate, newpage;
3117	uint64_t old_i_size;
3118	u_offset_t off;
3119	long start_resid = uio->uio_resid, premove_resid;
3120	rlim64_t limit = uio->uio_limit;
3121
3122
3123	ASSERT(RW_WRITE_HELD(&ip->i_contents));
3124	if ((ip->i_type != VREG) &&
3125	    (ip->i_type != VDIR) &&
3126	    (ip->i_type != VLNK)) {
3127		return (EIO);
3128	}
3129
3130	if (uio->uio_loffset >= MAXOFFSET_T) {
3131		return (EFBIG);
3132	}
3133	/*
3134	 * see udf_l_pathconf
3135	 */
3136	if (limit > (((uint64_t)1 << 40) - 1)) {
3137		limit = ((uint64_t)1 << 40) - 1;
3138	}
3139	if (uio->uio_loffset >= limit) {
3140		proc_t *p = ttoproc(curthread);
3141
3142		mutex_enter(&p->p_lock);
3143		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
3144		    p, RCA_UNSAFE_SIGINFO);
3145		mutex_exit(&p->p_lock);
3146		return (EFBIG);
3147	}
3148	if ((uio->uio_loffset < (offset_t)0) ||
3149	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
3150		return (EINVAL);
3151	}
3152	if (uio->uio_resid == 0) {
3153		return (0);
3154	}
3155
3156	mutex_enter(&ip->i_tlock);
3157	ip->i_flag |= INOACC;
3158
3159	if (ioflag & (FSYNC | FDSYNC)) {
3160		ip->i_flag |= ISYNC;
3161		iupdat_flag = 1;
3162	}
3163	mutex_exit(&ip->i_tlock);
3164
3165	udf_vfsp = ip->i_udf;
3166	vp = ITOV(ip);
3167
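	/*
	 * Write at most one file system block per pass through a
	 * segmap window, allocating any needed blocks with
	 * ud_bmap_write() before raising i_size.
	 */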
3168	do {
3169		u_offset_t uoff = uio->uio_loffset;
3170		off = uoff & (offset_t)MAXBMASK;
3171		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3172		on = (int)blkoff(udf_vfsp, uoff);
3173		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3174
3175		if (ip->i_type == VREG && uoff + n >= limit) {
3176			if (uoff >= limit) {
3177				error = EFBIG;
3178				goto out;
3179			}
3180			n = (int)(limit - (rlim64_t)uoff);
3181		}
3182		if (uoff + n > ip->i_size) {
3183			/*
3184			 * We are extending the length of the file.
3185			 * bmap is used so that we are sure that
3186			 * if we need to allocate new blocks, that it
3187			 * is done here before we up the file size.
3188			 */
3189			error = ud_bmap_write(ip, uoff,
3190			    (int)(on + n), mapon == 0, cr);
3191			if (error) {
3192				break;
3193			}
3194			i_size_changed = 1;
3195			old_i_size = ip->i_size;
3196			ip->i_size = uoff + n;
3197			/*
3198			 * If we are writing from the beginning of
3199			 * the mapping, we can just create the
3200			 * pages without having to read them.
3201			 */
3202			pagecreate = (mapon == 0);
3203		} else if (n == MAXBSIZE) {
3204			/*
3205			 * Going to do a whole mapping's worth,
3206			 * so we can just create the pages w/o
3207			 * having to read them in.  But before
3208			 * we do that, we need to make sure any
3209			 * needed blocks are allocated first.
3210			 */
3211			error = ud_bmap_write(ip, uoff,
3212			    (int)(on + n), 1, cr);
3213			if (error) {
3214				break;
3215			}
3216			pagecreate = 1;
3217		} else {
3218			pagecreate = 0;
3219		}
3220
3221		rw_exit(&ip->i_contents);
3222
3223		/*
3224		 * Touch the page and fault it in if it is not in
3225		 * core before segmap_getmapflt can lock it. This
3226		 * is to avoid the deadlock if the buffer is mapped
3227		 * to the same file through mmap which we want to
3228		 * write to.
3229		 */
3230		uio_prefaultpages((long)n, uio);
3231
3232		base = segmap_getmapflt(segkmap, vp, (off + mapon),
3233		    (uint32_t)n, !pagecreate, S_WRITE);
3234
3235		/*
3236		 * segmap_pagecreate() returns 1 if it calls
3237		 * page_create_va() to allocate any pages.
3238		 */
3239		newpage = 0;
3240		if (pagecreate) {
3241			newpage = segmap_pagecreate(segkmap, base,
3242			    (size_t)n, 0);
3243		}
3244
3245		premove_resid = uio->uio_resid;
3246		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3247
3248		if (pagecreate &&
3249		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3250			/*
3251			 * We created pages w/o initializing them completely,
3252			 * thus we need to zero the part that wasn't set up.
3253			 * This happens on most EOF write cases and if
3254			 * we had some sort of error during the uiomove.
3255			 */
3256			int nzero, nmoved;
3257
3258			nmoved = (int)(uio->uio_loffset - (off + mapon));
3259			ASSERT(nmoved >= 0 && nmoved <= n);
3260			nzero = roundup(on + n, PAGESIZE) - nmoved;
3261			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3262			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3263		}
3264
3265		/*
3266		 * Unlock the pages allocated by page_create_va()
3267		 * in segmap_pagecreate()
3268		 */
3269		if (newpage) {
3270			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3271		}
3272
3273		if (error) {
3274			/*
3275			 * If we failed on a write, we may have already
3276			 * allocated file blocks as well as pages.  It's
3277			 * hard to undo the block allocation, but we must
3278			 * be sure to invalidate any pages that may have
3279			 * been allocated.
3280			 */
3281			(void) segmap_release(segkmap, base, SM_INVAL);
3282		} else {
3283			flags = 0;
3284			/*
3285			 * Force write back for synchronous write cases.
3286			 */
3287			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3288				/*
3289				 * If the sticky bit is set but the
3290				 * execute bit is not set, we do a
3291				 * synchronous write back and free
3292				 * the page when done.  We set up swap
3293				 * files to be handled this way to
3294				 * prevent servers from keeping around
3295				 * the client's swap pages too long.
3296				 * XXX - there ought to be a better way.
3297				 */
3298				if (IS_SWAPVP(vp)) {
3299					flags = SM_WRITE | SM_FREE |
3300					    SM_DONTNEED;
3301					iupdat_flag = 0;
3302				} else {
3303					flags = SM_WRITE;
3304				}
3305			} else if (((mapon + n) == MAXBSIZE) ||
3306			    IS_SWAPVP(vp)) {
3307				/*
3308				 * Have written a whole block.
3309				 * Start an asynchronous write and
3310				 * mark the buffer to indicate that
3311				 * it won't be needed again soon.
3312				 */
3313				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
3314			}
3315			error = segmap_release(segkmap, base, flags);
3316
3317			/*
3318			 * If the operation failed and is synchronous,
3319			 * then we need to unwind what uiomove() last
3320			 * did so we can potentially return an error to
3321			 * the caller.  If this write operation was
3322			 * done in two pieces and the first succeeded,
3323			 * then we won't return an error for the second
3324			 * piece that failed.  However, we only want to
3325			 * return a resid value that reflects what was
3326			 * really done.
3327			 *
3328			 * Failures for non-synchronous operations can
3329			 * be ignored since the page subsystem will
3330			 * retry the operation until it succeeds or the
3331			 * file system is unmounted.
3332			 */
3333			if (error) {
3334				if ((ioflag & (FSYNC | FDSYNC)) ||
3335				    ip->i_type == VDIR) {
3336					uio->uio_resid = premove_resid;
3337				} else {
3338					error = 0;
3339				}
3340			}
3341		}
3342
3343		/*
3344		 * Re-acquire contents lock.
3345		 */
3346		rw_enter(&ip->i_contents, RW_WRITER);
3347		/*
3348		 * If the uiomove() failed or if a synchronous
3349		 * page push failed, fix up i_size.
3350		 */
3351		if (error) {
3352			if (i_size_changed) {
3353				/*
3354				 * The uiomove failed, and we
3355				 * allocated blocks, so get rid
3356				 * of them.
3357				 */
3358				(void) ud_itrunc(ip, old_i_size, 0, cr);
3359			}
3360		} else {
3361			/*
3362			 * XXX - Can this be out of the loop?
3363			 */
3364			ip->i_flag |= IUPD | ICHG;
3365			if (i_size_changed) {
3366				ip->i_flag |= IATTCHG;
3367			}
3368			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3369			    (IEXEC >> 10))) != 0 &&
3370			    (ip->i_char & (ISUID | ISGID)) != 0 &&
3371			    secpolicy_vnode_setid_retain(cr,
3372			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3373				/*
3374				 * Clear Set-UID & Set-GID bits on
3375				 * successful write if not privileged
3376				 * and at least one of the execute bits
3377				 * is set.  If we always clear Set-GID,
3378				 * mandatory file and record locking is
3379				 * unusable.
3380				 */
3381				ip->i_char &= ~(ISUID | ISGID);
3382			}
3383		}
3384	} while (error == 0 && uio->uio_resid > 0 && n != 0);
3385
3386out:
3387	/*
3388	 * Inode is updated according to this table -
3389	 *
3390	 *	FSYNC	FDSYNC(posix.4)
3391	 *	--------------------------
3392	 *	always@	IATTCHG|IBDWRITE
3393	 *
3394 * @ -  If we are doing a synchronous write, the only time we should
3395	 *	not be sync'ing the ip here is if we have the stickyhack
3396	 *	activated, the file is marked with the sticky bit and
3397	 *	no exec bit, the file length has not been changed and
3398	 *	no new blocks have been allocated during this write.
3399	 */
3400	if ((ip->i_flag & ISYNC) != 0) {
3401		/*
3402		 * we have eliminated nosync
3403		 */
3404		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3405		    ((ioflag & FSYNC) && iupdat_flag)) {
3406			ud_iupdat(ip, 1);
3407		}
3408	}
3409
3410	/*
3411	 * If we've already done a partial-write, terminate
3412	 * the write but return no error.
3413	 */
3414	if (start_resid != uio->uio_resid) {
3415		error = 0;
3416	}
3417	ip->i_flag &= ~(INOACC | ISYNC);
3418	ITIMES_NOLOCK(ip);
3419
3420	return (error);
3421}
3422
3423int32_t
3424ud_multi_strat(struct ud_inode *ip,
3425    page_t *pp, struct buf *bp, u_offset_t start)
3426{
3427	daddr_t bn;
3428	int32_t error = 0, io_count, contig, alloc_sz, i;
3429	uint32_t io_off;
3430	mio_master_t *mm = NULL;
3431	mio_slave_t *ms = NULL;
3432	struct buf *rbp;
3433
3434	ASSERT(!(start & PAGEOFFSET));
3435
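	/*
	 * Split the original buffer into one cloned buffer per
	 * contiguous on-disk extent.  The master structure tracks
	 * the outstanding byte count and any error, and
	 * ud_slave_done() completes the original buffer once the
	 * count drains to zero.
	 */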
3436	/*
3437	 * Figure out how many buffers to allocate
3438	 */
3439	io_count = 0;
3440	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3441		contig = 0;
3442		if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3443		    &bn, &contig)) {
3444			goto end;
3445		}
3446		if (contig == 0) {
3447			goto end;
3448		}
3449		contig = MIN(contig, PAGESIZE - io_off);
3450		if (bn != UDF_HOLE) {
3451			io_count ++;
3452		} else {
3453			/*
3454			 * HOLE
3455			 */
3456			if (bp->b_flags & B_READ) {
3457
3458				/*
3459				 * This is a hole being read;
3460				 * it should be filled with zeroes.
3461				 */
3462				pagezero(pp, io_off, contig);
3463			}
3464		}
3465	}
3466
3467
3468	if (io_count != 0) {
3469
3470		/*
3471		 * Allocate memory for all the
3472		 * required number of buffers
3473		 */
3474		alloc_sz = sizeof (mio_master_t) +
3475		    (sizeof (mio_slave_t) * io_count);
3476		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3477		if (mm == NULL) {
3478			error = ENOMEM;
3479			goto end;
3480		}
3481
3482		/*
3483		 * initialize master
3484		 */
3485		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3486		mm->mm_size = alloc_sz;
3487		mm->mm_bp = bp;
3488		mm->mm_resid = 0;
3489		mm->mm_error = 0;
3490		mm->mm_index = master_index++;
3491
3492		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3493
3494		/*
3495		 * Initialize buffers
3496		 */
3497		io_count = 0;
3498		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3499			contig = 0;
3500			if (error = ud_bmap_read(ip,
3501			    (u_offset_t)(start + io_off),
3502			    &bn, &contig)) {
3503				goto end;
3504			}
3505			ASSERT(contig);
3506			if ((io_off + contig) > bp->b_bcount) {
3507				contig = bp->b_bcount - io_off;
3508			}
3509			if (bn != UDF_HOLE) {
3510				/*
3511				 * Clone the buffer
3512				 * and prepare to start I/O
3513				 */
3514				ms->ms_ptr = mm;
3515				bioinit(&ms->ms_buf);
3516				rbp = bioclone(bp, io_off, (size_t)contig,
3517				    bp->b_edev, bn, ud_slave_done,
3518				    &ms->ms_buf, KM_NOSLEEP);
3519				ASSERT(rbp == &ms->ms_buf);
3520				mm->mm_resid += contig;
3521				io_count++;
3522				ms ++;
3523			}
3524		}
3525
3526		/*
3527		 * Start I/O's
3528		 */
3529		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3530		for (i = 0; i < io_count; i++) {
3531			(void) bdev_strategy(&ms->ms_buf);
3532			ms ++;
3533		}
3534	}
3535
3536end:
3537	if (error != 0) {
3538		bp->b_flags |= B_ERROR;
3539		bp->b_error = error;
3540		if (mm != NULL) {
3541			mutex_destroy(&mm->mm_mutex);
3542			kmem_free(mm, mm->mm_size);
3543		}
3544	}
3545	return (error);
3546}
3547
3548int32_t
3549ud_slave_done(struct buf *bp)
3550{
3551	mio_master_t *mm;
3552	int32_t resid;
3553
3554	ASSERT(SEMA_HELD(&bp->b_sem));
3555	ASSERT((bp->b_flags & B_DONE) == 0);
3556
3557	mm = ((mio_slave_t *)bp)->ms_ptr;
3558
3559	/*
3560	 * Propagate error and byte count info from slave struct to
3561	 * the master struct
3562	 */
3563	mutex_enter(&mm->mm_mutex);
3564	if (bp->b_flags & B_ERROR) {
3565
3566		/*
3567		 * If multiple slave buffers fail we keep
3568		 * only the latest error; this is fine
3569		 * because we cannot return more than
3570		 * one error anyway.
3571		 */
3572		mm->mm_error = bp->b_error;
3573	}
3574	mm->mm_resid -= bp->b_bcount;
3575	resid = mm->mm_resid;
3576	mutex_exit(&mm->mm_mutex);
3577
3578	/*
3579	 * free up the resources allocated to cloned buffers.
3580	 */
3581	bp_mapout(bp);
3582	biofini(bp);
3583
3584	if (resid == 0) {
3585
3586		/*
3587		 * This is the last I/O operation
3588		 * clean up and return the original buffer
3589		 */
3590		if (mm->mm_error) {
3591			mm->mm_bp->b_flags |= B_ERROR;
3592			mm->mm_bp->b_error = mm->mm_error;
3593		}
3594		biodone(mm->mm_bp);
3595		mutex_destroy(&mm->mm_mutex);
3596		kmem_free(mm, mm->mm_size);
3597	}
3598	return (0);
3599}
3600