1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27/* All Rights Reserved */
28
29
30#include <nfs/nfs4_clnt.h>
31#include <nfs/rnode4.h>
32#include <sys/systm.h>
33#include <sys/cmn_err.h>
34#include <sys/atomic.h>
35
36static void	nfs4_free_open_owner(nfs4_open_owner_t *, mntinfo4_t *);
37static nfs4_open_owner_t *find_freed_open_owner(cred_t *,
38				nfs4_oo_hash_bucket_t *, mntinfo4_t *);
39static open_delegation_type4 get_dtype(rnode4_t *);
40
41#ifdef DEBUG
42int nfs4_client_foo_debug = 0x0;
43int nfs4_client_open_dg = 0x0;
44/*
45 * If this is non-zero, the lockowner and openowner seqid sync primitives
46 * will intermittently return errors.
47 */
48static int seqid_sync_faults = 0;
49#endif
50
51stateid4 clnt_special0 = {
52	0,
53	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
54};
55
56stateid4 clnt_special1 = {
57	0xffffffff,
58	{
59		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
60		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
61		(char)0xff, (char)0xff, (char)0xff, (char)0xff
62	}
63};
64
65/* finds hash bucket and locks it */
66static nfs4_oo_hash_bucket_t *
67lock_bucket(cred_t *cr, mntinfo4_t *mi)
68{
69	nfs4_oo_hash_bucket_t *bucketp;
70	uint32_t hash_key;
71
72	hash_key = (uint32_t)(crgetuid(cr) + crgetruid(cr))
73	    % NFS4_NUM_OO_BUCKETS;
74	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "lock_bucket: "
75	    "hash_key %d for cred %p", hash_key, (void*)cr));
76
77	ASSERT(hash_key >= 0 && hash_key < NFS4_NUM_OO_BUCKETS);
78	ASSERT(mi != NULL);
79	ASSERT(mutex_owned(&mi->mi_lock));
80
81	bucketp = &(mi->mi_oo_list[hash_key]);
82	mutex_enter(&bucketp->b_lock);
83	return (bucketp);
84}
85
86/* unlocks hash bucket pointed by bucket_ptr */
87static void
88unlock_bucket(nfs4_oo_hash_bucket_t *bucketp)
89{
90	mutex_exit(&bucketp->b_lock);
91}
92
93/*
94 * Removes the lock owner from the rnode's lock_owners list and frees the
95 * corresponding reference.
96 */
97void
98nfs4_rnode_remove_lock_owner(rnode4_t *rp, nfs4_lock_owner_t *lop)
99{
100	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
101	    "nfs4_rnode_remove_lock_owner"));
102
103	mutex_enter(&rp->r_statev4_lock);
104
105	if (lop->lo_next_rnode == NULL) {
106		/* already removed from list */
107		mutex_exit(&rp->r_statev4_lock);
108		return;
109	}
110
111	ASSERT(lop->lo_prev_rnode != NULL);
112
113	lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
114	lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
115
116	lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
117
118	mutex_exit(&rp->r_statev4_lock);
119
120	/*
121	 * This would be an appropriate place for
122	 * RELEASE_LOCKOWNER.  For now, this is overkill
123	 * because in the common case, close is going to
124	 * release any lockowners anyway.
125	 */
126	lock_owner_rele(lop);
127}
128
129/*
130 * Remove all lock owners from the rnode's lock_owners list.  Frees up
131 * their references from the list.
132 */
133
134void
135nfs4_flush_lock_owners(rnode4_t *rp)
136{
137	nfs4_lock_owner_t *lop;
138
139	mutex_enter(&rp->r_statev4_lock);
140	while (rp->r_lo_head.lo_next_rnode != &rp->r_lo_head) {
141		lop = rp->r_lo_head.lo_next_rnode;
142		lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
143		lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
144		lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
145		lock_owner_rele(lop);
146	}
147	mutex_exit(&rp->r_statev4_lock);
148}
149
150void
151nfs4_clear_open_streams(rnode4_t *rp)
152{
153	nfs4_open_stream_t *osp;
154
155	mutex_enter(&rp->r_os_lock);
156	while ((osp = list_head(&rp->r_open_streams)) != NULL) {
157		open_owner_rele(osp->os_open_owner);
158		list_remove(&rp->r_open_streams, osp);
159		mutex_destroy(&osp->os_sync_lock);
160		osp->os_open_owner = NULL;
161		kmem_free(osp, sizeof (*osp));
162	}
163	mutex_exit(&rp->r_os_lock);
164}
165
166void
167open_owner_hold(nfs4_open_owner_t *oop)
168{
169	mutex_enter(&oop->oo_lock);
170	oop->oo_ref_count++;
171	mutex_exit(&oop->oo_lock);
172}
173
174/*
175 * Frees the open owner if the ref count hits zero.
176 */
177void
178open_owner_rele(nfs4_open_owner_t *oop)
179{
180	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
181	    "open_owner_rele"));
182
183	mutex_enter(&oop->oo_lock);
184	oop->oo_ref_count--;
185	if (oop->oo_ref_count == 0) {
186		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
187		    "open_owner_rele: freeing open owner"));
188		oop->oo_valid = 0;
189		mutex_exit(&oop->oo_lock);
190		/*
191		 * Ok, we don't destroy the open owner, nor do we put it on
192		 * the mntinfo4's free list just yet.  We are lazy about it
193		 * and let callers to find_open_owner() do that to keep locking
194		 * simple.
195		 */
196	} else {
197		mutex_exit(&oop->oo_lock);
198	}
199}
200
201void
202open_stream_hold(nfs4_open_stream_t *osp)
203{
204	mutex_enter(&osp->os_sync_lock);
205	osp->os_ref_count++;
206	mutex_exit(&osp->os_sync_lock);
207}
208
209/*
210 * Frees the open stream and removes it from the rnode4's open streams list if
211 * the ref count drops to zero.
212 */
213void
214open_stream_rele(nfs4_open_stream_t *osp, rnode4_t *rp)
215{
216	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
217	    "open_stream_rele"));
218
219	ASSERT(!mutex_owned(&rp->r_os_lock));
220
221	mutex_enter(&osp->os_sync_lock);
222	ASSERT(osp->os_ref_count > 0);
223	osp->os_ref_count--;
224	if (osp->os_ref_count == 0) {
225		nfs4_open_owner_t *tmp_oop;
226
227		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
228		    "open_stream_rele: freeing open stream"));
229		osp->os_valid = 0;
230		tmp_oop = osp->os_open_owner;
231		mutex_exit(&osp->os_sync_lock);
232
233		/* now see if we need to destroy the open owner */
234		open_owner_rele(tmp_oop);
235
236		mutex_enter(&rp->r_os_lock);
237		list_remove(&rp->r_open_streams, osp);
238		mutex_exit(&rp->r_os_lock);
239
240		/* free up osp */
241		mutex_destroy(&osp->os_sync_lock);
242		osp->os_open_owner = NULL;
243		kmem_free(osp, sizeof (*osp));
244	} else {
245		mutex_exit(&osp->os_sync_lock);
246	}
247}
248
249void
250lock_owner_hold(nfs4_lock_owner_t *lop)
251{
252	mutex_enter(&lop->lo_lock);
253	lop->lo_ref_count++;
254	mutex_exit(&lop->lo_lock);
255}
256
257/*
258 * Frees the lock owner if the ref count hits zero and
259 * the structure no longer has no locks.
260 */
261void
262lock_owner_rele(nfs4_lock_owner_t *lop)
263{
264	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
265	    "lock_owner_rele"));
266
267	mutex_enter(&lop->lo_lock);
268	lop->lo_ref_count--;
269	if (lop->lo_ref_count == 0) {
270		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
271		    "lock_owner_rele: freeing lock owner: "
272		    "%x", lop->lo_pid));
273		lop->lo_valid = 0;
274		/*
275		 * If there are no references, the lock_owner should
276		 * already be off the rnode's list.
277		 */
278		ASSERT(lop->lo_next_rnode == NULL);
279		ASSERT(lop->lo_prev_rnode == NULL);
280		ASSERT(!(lop->lo_flags & NFS4_LOCK_SEQID_INUSE));
281		ASSERT(lop->lo_seqid_holder == NULL);
282		mutex_exit(&lop->lo_lock);
283
284		/* free up lop */
285		cv_destroy(&lop->lo_cv_seqid_sync);
286		mutex_destroy(&lop->lo_lock);
287		kmem_free(lop, sizeof (*lop));
288	} else {
289		mutex_exit(&lop->lo_lock);
290	}
291}
292
293/*
294 * This increments the open owner ref count if found.
295 * The argument 'just_created' determines whether we are looking for open
296 * owners with the 'oo_just_created' flag set or not.
297 */
298nfs4_open_owner_t *
299find_open_owner_nolock(cred_t *cr, int just_created, mntinfo4_t *mi)
300{
301	nfs4_open_owner_t	*oop = NULL, *next_oop;
302	nfs4_oo_hash_bucket_t	*bucketp;
303
304	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
305	    "find_open_owner: cred %p, just_created %d",
306	    (void*)cr, just_created));
307
308	ASSERT(mi != NULL);
309	ASSERT(mutex_owned(&mi->mi_lock));
310
311	bucketp = lock_bucket(cr, mi);
312
313	/* got hash bucket, search through open owners */
314	for (oop = list_head(&bucketp->b_oo_hash_list); oop != NULL; ) {
315		mutex_enter(&oop->oo_lock);
316		if (!crcmp(oop->oo_cred, cr) &&
317		    (oop->oo_just_created == just_created ||
318		    just_created == NFS4_JUST_CREATED)) {
319			/* match */
320			if (oop->oo_valid == 0) {
321				/* reactivate the open owner */
322				oop->oo_valid = 1;
323				ASSERT(oop->oo_ref_count == 0);
324			}
325			oop->oo_ref_count++;
326			mutex_exit(&oop->oo_lock);
327			unlock_bucket(bucketp);
328			return (oop);
329		}
330		next_oop = list_next(&bucketp->b_oo_hash_list, oop);
331		if (oop->oo_valid == 0) {
332			list_remove(&bucketp->b_oo_hash_list, oop);
333
334			/*
335			 * Now we go ahead and put this open owner
336			 * on the freed list.  This is our lazy method.
337			 */
338			nfs4_free_open_owner(oop, mi);
339		}
340
341		mutex_exit(&oop->oo_lock);
342		oop = next_oop;
343	}
344
345	/* search through recently freed open owners */
346	oop = find_freed_open_owner(cr, bucketp, mi);
347
348	unlock_bucket(bucketp);
349
350	return (oop);
351}
352
353nfs4_open_owner_t *
354find_open_owner(cred_t *cr, int just_created, mntinfo4_t *mi)
355{
356	nfs4_open_owner_t *oop;
357
358	mutex_enter(&mi->mi_lock);
359	oop = find_open_owner_nolock(cr, just_created, mi);
360	mutex_exit(&mi->mi_lock);
361
362	return (oop);
363}
364
365/*
366 * This increments osp's ref count if found.
367 * Returns with 'os_sync_lock' held.
368 */
369nfs4_open_stream_t *
370find_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
371{
372	nfs4_open_stream_t	*osp;
373
374	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
375	    "find_open_stream"));
376
377	mutex_enter(&rp->r_os_lock);
378	/* Now, no one can add or delete to rp's open streams list */
379	for (osp = list_head(&rp->r_open_streams); osp != NULL;
380	    osp = list_next(&rp->r_open_streams, osp)) {
381		mutex_enter(&osp->os_sync_lock);
382		if (osp->os_open_owner == oop && osp->os_valid != 0) {
383			/* match */
384			NFS4_DEBUG(nfs4_client_state_debug,
385			    (CE_NOTE, "find_open_stream "
386			    "got a match"));
387
388			osp->os_ref_count++;
389			mutex_exit(&rp->r_os_lock);
390			return (osp);
391		}
392		mutex_exit(&osp->os_sync_lock);
393	}
394
395	mutex_exit(&rp->r_os_lock);
396	return (NULL);
397}
398
399/*
400 * Find the lock owner for the given file and process ID.  If "which" is
401 * LOWN_VALID_STATEID, require that the lock owner contain a valid stateid
402 * from the server.
403 *
404 * This increments the lock owner's ref count if found.  Returns NULL if
405 * there was no match.
406 */
407nfs4_lock_owner_t *
408find_lock_owner(rnode4_t *rp, pid_t pid, lown_which_t which)
409{
410	nfs4_lock_owner_t	*lop, *next_lop;
411
412	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
413	    "find_lock_owner: pid %x, which %d", pid, which));
414
415	ASSERT(which == LOWN_ANY || which == LOWN_VALID_STATEID);
416
417	/* search by pid */
418	mutex_enter(&rp->r_statev4_lock);
419
420	lop = rp->r_lo_head.lo_next_rnode;
421	while (lop != &rp->r_lo_head) {
422		mutex_enter(&lop->lo_lock);
423		if (lop->lo_pid == pid && lop->lo_valid != 0 &&
424		    !(lop->lo_flags & NFS4_BAD_SEQID_LOCK)) {
425			if (which == LOWN_ANY ||
426			    lop->lo_just_created != NFS4_JUST_CREATED) {
427				/* Found a matching lock owner */
428				NFS4_DEBUG(nfs4_client_state_debug,
429				    (CE_NOTE, "find_lock_owner: "
430				    "got a match"));
431
432				lop->lo_ref_count++;
433				mutex_exit(&lop->lo_lock);
434				mutex_exit(&rp->r_statev4_lock);
435				return (lop);
436			}
437		}
438		next_lop = lop->lo_next_rnode;
439		mutex_exit(&lop->lo_lock);
440		lop = next_lop;
441	}
442
443	mutex_exit(&rp->r_statev4_lock);
444	return (NULL);
445}
446
447/*
448 * This returns the delegation stateid as 'sid'. Returns 1 if a successful
449 * delegation stateid was found, otherwise returns 0.
450 */
451
452static int
453nfs4_get_deleg_stateid(rnode4_t *rp, nfs_opnum4 op, stateid4 *sid)
454{
455	ASSERT(!mutex_owned(&rp->r_statev4_lock));
456
457	mutex_enter(&rp->r_statev4_lock);
458	if (((rp->r_deleg_type == OPEN_DELEGATE_WRITE && op == OP_WRITE) ||
459	    (rp->r_deleg_type != OPEN_DELEGATE_NONE && op != OP_WRITE)) &&
460	    !rp->r_deleg_return_pending) {
461
462		*sid = rp->r_deleg_stateid;
463		mutex_exit(&rp->r_statev4_lock);
464		return (1);
465	}
466	mutex_exit(&rp->r_statev4_lock);
467	return (0);
468}
469
470/*
471 * This returns the lock stateid as 'sid'. Returns 1 if a successful lock
472 * stateid was found, otherwise returns 0.
473 */
474static int
475nfs4_get_lock_stateid(rnode4_t *rp, pid_t pid, stateid4 *sid)
476{
477	nfs4_lock_owner_t *lop;
478
479	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
480
481	if (lop) {
482		/*
483		 * Found a matching lock owner, so use a lock
484		 * stateid rather than an open stateid.
485		 */
486		mutex_enter(&lop->lo_lock);
487		*sid = lop->lock_stateid;
488		mutex_exit(&lop->lo_lock);
489		lock_owner_rele(lop);
490		return (1);
491	}
492
493	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
494	    "nfs4_get_lock_stateid: no lop"));
495	return (0);
496}
497
498/*
499 * This returns the open stateid as 'sid'. Returns 1 if a successful open
500 * stateid was found, otherwise returns 0.
501 *
502 * Once the stateid is returned to the caller, it is no longer protected;
503 * so the caller must be prepared to handle OLD/BAD_STATEID where
504 * appropiate.
505 */
506static int
507nfs4_get_open_stateid(rnode4_t *rp, cred_t *cr, mntinfo4_t *mi, stateid4 *sid)
508{
509	nfs4_open_owner_t *oop;
510	nfs4_open_stream_t *osp;
511
512	ASSERT(mi != NULL);
513
514	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
515	if (!oop) {
516		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
517		    "nfs4_get_open_stateid: no oop"));
518		return (0);
519	}
520
521	osp = find_open_stream(oop, rp);
522	open_owner_rele(oop);
523	if (!osp) {
524		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
525		    "nfs4_get_open_stateid: no osp"));
526		return (0);
527	}
528
529	if (osp->os_failed_reopen) {
530		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
531		    "nfs4_get_open_stateid: osp %p failed reopen",
532		    (void *)osp));
533		mutex_exit(&osp->os_sync_lock);
534		open_stream_rele(osp, rp);
535		return (0);
536	}
537	*sid = osp->open_stateid;
538	mutex_exit(&osp->os_sync_lock);
539	open_stream_rele(osp, rp);
540	return (1);
541}
542
543/*
544 * Returns the delegation stateid if this 'op' is OP_WRITE and the
545 * delegation we hold is a write delegation, OR this 'op' is not
546 * OP_WRITE and we have a delegation held (read or write), otherwise
547 * returns the lock stateid if there is a lock owner, otherwise
548 * returns the open stateid if there is a open stream, otherwise
549 * returns special stateid <seqid = 0, other = 0>.
550 *
551 * Used for WRITE operations.
552 */
553stateid4
554nfs4_get_w_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
555    nfs_opnum4 op, nfs4_stateid_types_t *sid_tp)
556{
557	stateid4 sid;
558
559	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
560		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
561			sid_tp->cur_sid_type = DEL_SID;
562			return (sid);
563		}
564	}
565	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
566		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
567			sid_tp->cur_sid_type = LOCK_SID;
568			return (sid);
569		}
570	}
571	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
572		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
573			sid_tp->cur_sid_type = OPEN_SID;
574			return (sid);
575		}
576	}
577	bzero(&sid, sizeof (stateid4));
578	sid_tp->cur_sid_type = SPEC_SID;
579	return (sid);
580}
581
582/*
583 * Returns the delegation stateid if this 'op' is OP_WRITE and the
584 * delegation we hold is a write delegation, OR this 'op' is not
585 * OP_WRITE and we have a delegation held (read or write), otherwise
586 * returns the lock stateid if there is a lock owner, otherwise
587 * returns the open stateid if there is a open stream, otherwise
588 * returns special stateid <seqid = 0, other = 0>.
589 *
590 * This also updates which stateid we are using in 'sid_tp', skips
591 * previously attempted stateids, and skips checking higher priority
592 * stateids than the current level as dictated by 'sid_tp->cur_sid_type'
593 * for async reads.
594 *
595 * Used for READ and SETATTR operations.
596 */
597stateid4
598nfs4_get_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
599    nfs_opnum4 op, nfs4_stateid_types_t *sid_tp, bool_t async_read)
600{
601	stateid4 sid;
602
603	/*
604	 * For asynchronous READs, do not attempt to retry from the start of
605	 * the stateid priority list, just continue from where you last left
606	 * off.
607	 */
608	if (async_read) {
609		switch (sid_tp->cur_sid_type) {
610		case NO_SID:
611			break;
612		case DEL_SID:
613			goto lock_stateid;
614		case LOCK_SID:
615			goto open_stateid;
616		case OPEN_SID:
617			goto special_stateid;
618		case SPEC_SID:
619		default:
620			cmn_err(CE_PANIC, "nfs4_get_stateid: illegal current "
621			    "stateid type %d", sid_tp->cur_sid_type);
622		}
623	}
624
625	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
626		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
627			sid_tp->cur_sid_type = DEL_SID;
628			return (sid);
629		}
630	}
631lock_stateid:
632	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
633		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
634			sid_tp->cur_sid_type = LOCK_SID;
635			return (sid);
636		}
637	}
638open_stateid:
639	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
640		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
641			sid_tp->cur_sid_type = OPEN_SID;
642			return (sid);
643		}
644	}
645special_stateid:
646	bzero(&sid, sizeof (stateid4));
647	sid_tp->cur_sid_type = SPEC_SID;
648	return	(sid);
649}
650
651void
652nfs4_set_lock_stateid(nfs4_lock_owner_t *lop, stateid4 stateid)
653{
654	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
655	    "nfs4_set_lock_stateid"));
656
657	ASSERT(lop);
658	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
659
660	mutex_enter(&lop->lo_lock);
661	lop->lock_stateid = stateid;
662	mutex_exit(&lop->lo_lock);
663}
664
665/*
666 * Sequence number used when a new open owner is needed.
667 * This is used so as to not confuse the server.  Since a open owner
668 * is based off of cred, a cred could be re-used quickly, and the server
669 * may not release all state for a cred.
670 */
671static uint64_t open_owner_seq_num = 0;
672
673uint64_t
674nfs4_get_new_oo_name(void)
675{
676	return (atomic_inc_64_nv(&open_owner_seq_num));
677}
678
679/*
680 * Create a new open owner and add it to the open owner hash table.
681 */
682nfs4_open_owner_t *
683create_open_owner(cred_t *cr, mntinfo4_t *mi)
684{
685	nfs4_open_owner_t	*oop;
686	nfs4_oo_hash_bucket_t	*bucketp;
687
688	oop = kmem_alloc(sizeof (nfs4_open_owner_t), KM_SLEEP);
689	/*
690	 * Make sure the cred doesn't go away when we put this open owner
691	 * on the free list, as well as make crcmp() a valid check.
692	 */
693	crhold(cr);
694	oop->oo_cred = cr;
695	mutex_init(&oop->oo_lock, NULL, MUTEX_DEFAULT, NULL);
696	oop->oo_ref_count = 1;
697	oop->oo_valid = 1;
698	oop->oo_just_created = NFS4_JUST_CREATED;
699	oop->oo_seqid = 0;
700	oop->oo_seqid_inuse = 0;
701	oop->oo_last_good_seqid = 0;
702	oop->oo_last_good_op = TAG_NONE;
703	oop->oo_cred_otw = NULL;
704	cv_init(&oop->oo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
705
706	/*
707	 * A Solaris open_owner is <oo_seq_num>
708	 */
709	oop->oo_name = nfs4_get_new_oo_name();
710
711	/* now add the struct into the cred hash table */
712	ASSERT(mutex_owned(&mi->mi_lock));
713	bucketp = lock_bucket(cr, mi);
714	list_insert_head(&bucketp->b_oo_hash_list, oop);
715	unlock_bucket(bucketp);
716
717	return (oop);
718}
719
720/*
721 * Create a new open stream and it to the rnode's list.
722 * Increments the ref count on oop.
723 * Returns with 'os_sync_lock' held.
724 */
725nfs4_open_stream_t *
726create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
727{
728	nfs4_open_stream_t	*osp;
729
730#ifdef DEBUG
731	mutex_enter(&oop->oo_lock);
732	ASSERT(oop->oo_seqid_inuse);
733	mutex_exit(&oop->oo_lock);
734#endif
735
736	osp = kmem_alloc(sizeof (nfs4_open_stream_t), KM_SLEEP);
737	osp->os_open_ref_count = 1;
738	osp->os_mapcnt = 0;
739	osp->os_ref_count = 2;
740	osp->os_valid = 1;
741	osp->os_open_owner = oop;
742	osp->os_orig_oo_name = oop->oo_name;
743	bzero(&osp->open_stateid, sizeof (stateid4));
744	osp->os_share_acc_read = 0;
745	osp->os_share_acc_write = 0;
746	osp->os_mmap_read = 0;
747	osp->os_mmap_write = 0;
748	osp->os_share_deny_none = 0;
749	osp->os_share_deny_read = 0;
750	osp->os_share_deny_write = 0;
751	osp->os_delegation = 0;
752	osp->os_dc_openacc = 0;
753	osp->os_final_close = 0;
754	osp->os_pending_close = 0;
755	osp->os_failed_reopen = 0;
756	osp->os_force_close = 0;
757	mutex_init(&osp->os_sync_lock, NULL, MUTEX_DEFAULT, NULL);
758
759	/* open owner gets a reference */
760	open_owner_hold(oop);
761
762	/* now add the open stream to rp */
763	mutex_enter(&rp->r_os_lock);
764	mutex_enter(&osp->os_sync_lock);
765	list_insert_head(&rp->r_open_streams, osp);
766	mutex_exit(&rp->r_os_lock);
767
768	return (osp);
769}
770
771/*
772 * Returns an open stream with 'os_sync_lock' held.
773 * If the open stream is found (rather than created), its
774 * 'os_open_ref_count' is bumped.
775 *
776 * There is no race with two threads entering this function
777 * and creating two open streams for the same <oop, rp> pair.
778 * This is because the open seqid sync must be acquired, thus
779 * only allowing one thread in at a time.
780 */
781nfs4_open_stream_t *
782find_or_create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp,
783    int *created_osp)
784{
785	nfs4_open_stream_t *osp;
786
787#ifdef DEBUG
788	mutex_enter(&oop->oo_lock);
789	ASSERT(oop->oo_seqid_inuse);
790	mutex_exit(&oop->oo_lock);
791#endif
792
793	osp = find_open_stream(oop, rp);
794	if (!osp) {
795		osp = create_open_stream(oop, rp);
796		if (osp)
797			*created_osp = 1;
798	} else {
799		*created_osp = 0;
800		osp->os_open_ref_count++;
801	}
802
803	return (osp);
804}
805
806static uint64_t lock_owner_seq_num = 0;
807
808/*
809 * Create a new lock owner and add it to the rnode's list.
810 * Assumes the rnode's r_statev4_lock is held.
811 * The created lock owner has a reference count of 2: one for the list and
812 * one for the caller to use.  Returns the lock owner locked down.
813 */
814nfs4_lock_owner_t *
815create_lock_owner(rnode4_t *rp, pid_t pid)
816{
817	nfs4_lock_owner_t	*lop;
818
819	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
820	    "create_lock_owner: pid %x", pid));
821
822	ASSERT(mutex_owned(&rp->r_statev4_lock));
823
824	lop = kmem_alloc(sizeof (nfs4_lock_owner_t), KM_SLEEP);
825	lop->lo_ref_count = 2;
826	lop->lo_valid = 1;
827	bzero(&lop->lock_stateid, sizeof (stateid4));
828	lop->lo_pid = pid;
829	lop->lock_seqid = 0;
830	lop->lo_pending_rqsts = 0;
831	lop->lo_just_created = NFS4_JUST_CREATED;
832	lop->lo_flags = 0;
833	lop->lo_seqid_holder = NULL;
834
835	/*
836	 * A Solaris lock_owner is <seq_num><pid>
837	 */
838	lop->lock_owner_name.ln_seq_num =
839	    atomic_inc_64_nv(&lock_owner_seq_num);
840	lop->lock_owner_name.ln_pid = pid;
841
842	cv_init(&lop->lo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
843	mutex_init(&lop->lo_lock, NULL, MUTEX_DEFAULT, NULL);
844
845	mutex_enter(&lop->lo_lock);
846
847	/* now add the lock owner to rp */
848	lop->lo_prev_rnode = &rp->r_lo_head;
849	lop->lo_next_rnode = rp->r_lo_head.lo_next_rnode;
850	rp->r_lo_head.lo_next_rnode->lo_prev_rnode = lop;
851	rp->r_lo_head.lo_next_rnode = lop;
852
853	return (lop);
854
855}
856
857/*
858 * This sets the lock seqid of a lock owner.
859 */
860void
861nfs4_set_lock_seqid(seqid4 seqid, nfs4_lock_owner_t *lop)
862{
863	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
864	    "nfs4_set_lock_seqid"));
865
866	ASSERT(lop != NULL);
867	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
868
869	lop->lock_seqid = seqid;
870}
871
872static void
873nfs4_set_new_lock_owner_args(lock_owner4 *owner, pid_t pid)
874{
875	nfs4_lo_name_t *cast_namep;
876
877	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
878	    "nfs4_set_new_lock_owner_args"));
879
880	owner->owner_len = sizeof (*cast_namep);
881	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
882	/*
883	 * A Solaris lock_owner is <seq_num><pid>
884	 */
885	cast_namep = (nfs4_lo_name_t *)owner->owner_val;
886	cast_namep->ln_seq_num = atomic_inc_64_nv(&lock_owner_seq_num);
887	cast_namep->ln_pid = pid;
888}
889
890/*
891 * Fill in the lock owner args.
892 */
893void
894nfs4_setlockowner_args(lock_owner4 *owner, rnode4_t *rp, pid_t pid)
895{
896	nfs4_lock_owner_t *lop;
897
898	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
899	    "nfs4_setlockowner_args"));
900
901	/* This increments lop's ref count */
902	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
903
904	if (!lop)
905		goto make_up_args;
906
907	mutex_enter(&lop->lo_lock);
908	owner->owner_len = sizeof (lop->lock_owner_name);
909	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
910	bcopy(&lop->lock_owner_name, owner->owner_val,
911	    owner->owner_len);
912	mutex_exit(&lop->lo_lock);
913	lock_owner_rele(lop);
914	return;
915
916make_up_args:
917	nfs4_set_new_lock_owner_args(owner, pid);
918}
919
920/*
921 * This ends our use of the open owner's open seqid by setting
922 * the appropiate flags and issuing a cv_signal to wake up another
923 * thread waiting to use the open seqid.
924 */
925
926void
927nfs4_end_open_seqid_sync(nfs4_open_owner_t *oop)
928{
929	mutex_enter(&oop->oo_lock);
930	ASSERT(oop->oo_seqid_inuse);
931	oop->oo_seqid_inuse = 0;
932	cv_signal(&oop->oo_cv_seqid_sync);
933	mutex_exit(&oop->oo_lock);
934}
935
936/*
937 * This starts our use of the open owner's open seqid by setting
938 * the oo_seqid_inuse to true.  We will wait (forever) with a
939 * cv_wait() until we are woken up.
940 *
941 * Return values:
942 * 0		no problems
943 * EAGAIN	caller should retry (like a recovery retry)
944 */
945int
946nfs4_start_open_seqid_sync(nfs4_open_owner_t *oop, mntinfo4_t *mi)
947{
948	int error = 0;
949#ifdef DEBUG
950	static int ops = 0;		/* fault injection */
951#endif
952
953#ifdef DEBUG
954	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
955	    ++ops % 5 == 0)
956		return (EAGAIN);
957#endif
958
959	mutex_enter(&mi->mi_lock);
960	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
961	    curthread != mi->mi_recovthread)
962		error = EAGAIN;
963	mutex_exit(&mi->mi_lock);
964	if (error != 0)
965		goto done;
966
967	mutex_enter(&oop->oo_lock);
968
969	while (oop->oo_seqid_inuse) {
970		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
971		    "nfs4_start_open_seqid_sync waiting on cv"));
972
973		cv_wait(&oop->oo_cv_seqid_sync, &oop->oo_lock);
974	}
975
976	oop->oo_seqid_inuse = 1;
977
978	mutex_exit(&oop->oo_lock);
979
980	mutex_enter(&mi->mi_lock);
981	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
982	    curthread != mi->mi_recovthread)
983		error = EAGAIN;
984	mutex_exit(&mi->mi_lock);
985
986	if (error == EAGAIN)
987		nfs4_end_open_seqid_sync(oop);
988
989	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
990	    "nfs4_start_open_seqid_sync: error=%d", error));
991
992done:
993	return (error);
994}
995
996#ifdef	DEBUG
997int bypass_otw[2];
998#endif
999
1000/*
1001 * Checks to see if the OPEN OTW is necessary that is, if it's already
1002 * been opened with the same access and deny bits we are now asking for.
1003 * Note, this assumes that *vp is a rnode.
1004 */
1005int
1006nfs4_is_otw_open_necessary(nfs4_open_owner_t *oop, int flag, vnode_t *vp,
1007    int just_been_created, int *errorp, int acc, nfs4_recov_state_t *rsp)
1008{
1009	rnode4_t *rp;
1010	nfs4_open_stream_t *osp;
1011	open_delegation_type4 dt;
1012
1013	rp = VTOR4(vp);
1014
1015	/*
1016	 * Grab the delegation type.  This function is protected against
1017	 * the delegation being returned by virtue of start_op (called
1018	 * by nfs4open_otw) taking the r_deleg_recall_lock in read mode,
1019	 * delegreturn requires this lock in write mode to proceed.
1020	 */
1021	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_READER));
1022	dt = get_dtype(rp);
1023
1024	/* returns with 'os_sync_lock' held */
1025	osp = find_open_stream(oop, rp);
1026
1027	if (osp) {
1028		uint32_t	do_otw = 0;
1029
1030		if (osp->os_failed_reopen) {
1031			NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
1032			    "nfs4_is_otw_open_necessary: os_failed_reopen "
1033			    "set on osp %p, cr %p, rp %s", (void *)osp,
1034			    (void *)osp->os_open_owner->oo_cred,
1035			    rnode4info(rp)));
1036			do_otw = 1;
1037		}
1038
1039		/*
1040		 * check access/deny bits
1041		 */
1042		if (!do_otw && (flag & FREAD))
1043			if (osp->os_share_acc_read == 0 &&
1044			    dt == OPEN_DELEGATE_NONE)
1045				do_otw = 1;
1046
1047		if (!do_otw && (flag & FWRITE))
1048			if (osp->os_share_acc_write == 0 &&
1049			    dt != OPEN_DELEGATE_WRITE)
1050				do_otw = 1;
1051
1052		if (!do_otw) {
1053			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1054			    "nfs4_is_otw_open_necessary: can skip this "
1055			    "open OTW"));
1056			if (!just_been_created) {
1057				osp->os_open_ref_count++;
1058				if (flag & FREAD)
1059					osp->os_share_acc_read++;
1060				if (flag & FWRITE)
1061					osp->os_share_acc_write++;
1062				osp->os_share_deny_none++;
1063			}
1064
1065			/*
1066			 * Need to reset this bitfield for the possible case
1067			 * where we were going to OTW CLOSE the file, got a
1068			 * non-recoverable error, and before we could retry
1069			 * the CLOSE, OPENed the file again.
1070			 */
1071			ASSERT(osp->os_open_owner->oo_seqid_inuse);
1072			osp->os_final_close = 0;
1073			osp->os_force_close = 0;
1074
1075			mutex_exit(&osp->os_sync_lock);
1076			open_stream_rele(osp, rp);
1077
1078#ifdef	DEBUG
1079			bypass_otw[0]++;
1080#endif
1081
1082			*errorp = 0;
1083			return (0);
1084		}
1085		mutex_exit(&osp->os_sync_lock);
1086		open_stream_rele(osp, rp);
1087
1088	} else if (dt != OPEN_DELEGATE_NONE) {
1089		/*
1090		 * Even if there isn't an open_stream yet, we may still be
1091		 * able to bypass the otw open if the client owns a delegation.
1092		 *
1093		 * If you are asking for for WRITE, but I only have
1094		 * a read delegation, then you still have to go otw.
1095		 */
1096
1097		if (flag & FWRITE && dt == OPEN_DELEGATE_READ)
1098			return (1);
1099
1100		/*
1101		 * TODO - evaluate the nfsace4
1102		 */
1103
1104		/*
1105		 * Check the access flags to make sure the caller
1106		 * had permission.
1107		 */
1108		if (flag & FREAD && !(acc & VREAD))
1109			return (1);
1110
1111		if (flag & FWRITE && !(acc & VWRITE))
1112			return (1);
1113
1114		/*
1115		 * create_open_stream will add a reference to oop,
1116		 * this will prevent the open_owner_rele done in
1117		 * nfs4open_otw from destroying the open_owner.
1118		 */
1119
1120		/* returns with 'os_sync_lock' held */
1121		osp = create_open_stream(oop, rp);
1122		if (osp == NULL)
1123			return (1);
1124
1125		osp->open_stateid = rp->r_deleg_stateid;
1126		osp->os_delegation = 1;
1127
1128		if (flag & FREAD)
1129			osp->os_share_acc_read++;
1130		if (flag & FWRITE)
1131			osp->os_share_acc_write++;
1132
1133		osp->os_share_deny_none++;
1134		mutex_exit(&osp->os_sync_lock);
1135
1136		open_stream_rele(osp, rp);
1137
1138		mutex_enter(&oop->oo_lock);
1139		oop->oo_just_created = NFS4_PERM_CREATED;
1140		mutex_exit(&oop->oo_lock);
1141
1142		ASSERT(rsp != NULL);
1143		if (rsp->rs_sp != NULL) {
1144			mutex_enter(&rsp->rs_sp->s_lock);
1145			nfs4_inc_state_ref_count_nolock(rsp->rs_sp,
1146			    VTOMI4(vp));
1147			mutex_exit(&rsp->rs_sp->s_lock);
1148		}
1149#ifdef	DEBUG
1150		bypass_otw[1]++;
1151#endif
1152
1153		*errorp = 0;
1154		return (0);
1155	}
1156
1157	return (1);
1158}
1159
1160static open_delegation_type4
1161get_dtype(rnode4_t *rp)
1162{
1163	open_delegation_type4 dt;
1164
1165	mutex_enter(&rp->r_statev4_lock);
1166	ASSERT(!rp->r_deleg_return_inprog);
1167	if (rp->r_deleg_return_pending)
1168		dt = OPEN_DELEGATE_NONE;
1169	else
1170		dt = rp->r_deleg_type;
1171	mutex_exit(&rp->r_statev4_lock);
1172
1173	return (dt);
1174}
1175
1176/*
1177 * Fill in *locker with the lock state arguments for a LOCK call.  If
1178 * lop->lo_just_created == NFS4_JUST_CREATED, oop and osp must be non-NULL.
1179 * Caller must already hold the necessary seqid sync lock(s).
1180 */
1181
1182void
1183nfs4_setup_lock_args(nfs4_lock_owner_t *lop, nfs4_open_owner_t *oop,
1184    nfs4_open_stream_t *osp, clientid4 clientid, locker4 *locker)
1185{
1186	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1187	if (lop->lo_just_created == NFS4_JUST_CREATED) {
1188		/* this is a new lock request */
1189		open_to_lock_owner4 *nown;
1190
1191		ASSERT(oop != NULL);
1192		ASSERT(osp != NULL);
1193
1194		locker->new_lock_owner = TRUE;
1195		nown = &locker->locker4_u.open_owner;
1196		nown->open_seqid = nfs4_get_open_seqid(oop) + 1;
1197		mutex_enter(&osp->os_sync_lock);
1198		nown->open_stateid = osp->open_stateid;
1199		mutex_exit(&osp->os_sync_lock);
1200		nown->lock_seqid = lop->lock_seqid; /* initial, so no +1 */
1201
1202		nown->lock_owner.clientid = clientid;
1203		nown->lock_owner.owner_len = sizeof (lop->lock_owner_name);
1204		nown->lock_owner.owner_val =
1205		    kmem_alloc(nown->lock_owner.owner_len, KM_SLEEP);
1206		bcopy(&lop->lock_owner_name, nown->lock_owner.owner_val,
1207		    nown->lock_owner.owner_len);
1208	} else {
1209		exist_lock_owner4 *eown;
1210		/* have an existing lock owner */
1211
1212		locker->new_lock_owner = FALSE;
1213		eown = &locker->locker4_u.lock_owner;
1214		mutex_enter(&lop->lo_lock);
1215		eown->lock_stateid = lop->lock_stateid;
1216		mutex_exit(&lop->lo_lock);
1217		eown->lock_seqid = lop->lock_seqid + 1;
1218	}
1219}
1220
1221/*
1222 * This starts our use of the lock owner's lock seqid by setting
1223 * the lo_flags to NFS4_LOCK_SEQID_INUSE.  We will wait (forever)
1224 * with a cv_wait() until we are woken up.
1225 *
1226 * Return values:
1227 * 0		no problems
1228 * EAGAIN	caller should retry (like a recovery retry)
1229 */
1230int
1231nfs4_start_lock_seqid_sync(nfs4_lock_owner_t *lop, mntinfo4_t *mi)
1232{
1233	int error = 0;
1234#ifdef DEBUG
1235	static int ops = 0;		/* fault injection */
1236#endif
1237
1238#ifdef DEBUG
1239	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
1240	    ++ops % 7 == 0)
1241		return (EAGAIN);
1242#endif
1243
1244	mutex_enter(&mi->mi_lock);
1245	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1246	    curthread != mi->mi_recovthread)
1247		error = EAGAIN;
1248	mutex_exit(&mi->mi_lock);
1249	if (error != 0)
1250		goto done;
1251
1252	mutex_enter(&lop->lo_lock);
1253
1254	ASSERT(lop->lo_seqid_holder != curthread);
1255	while (lop->lo_flags & NFS4_LOCK_SEQID_INUSE) {
1256		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1257		    "nfs4_start_lock_seqid_sync: waiting on cv"));
1258
1259		cv_wait(&lop->lo_cv_seqid_sync, &lop->lo_lock);
1260	}
1261	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4_start_lock_seqid_sync: "
1262	    "NFS4_LOCK_SEQID_INUSE"));
1263
1264	lop->lo_flags |= NFS4_LOCK_SEQID_INUSE;
1265	lop->lo_seqid_holder = curthread;
1266	mutex_exit(&lop->lo_lock);
1267
1268	mutex_enter(&mi->mi_lock);
1269	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1270	    curthread != mi->mi_recovthread)
1271		error = EAGAIN;
1272	mutex_exit(&mi->mi_lock);
1273
1274	if (error == EAGAIN)
1275		nfs4_end_lock_seqid_sync(lop);
1276
1277	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1278	    "nfs4_start_lock_seqid_sync: error=%d", error));
1279
1280done:
1281	return (error);
1282}
1283
1284/*
1285 * This ends our use of the lock owner's lock seqid by setting
1286 * the appropiate flags and issuing a cv_signal to wake up another
1287 * thread waiting to use the lock seqid.
1288 */
1289void
1290nfs4_end_lock_seqid_sync(nfs4_lock_owner_t *lop)
1291{
1292	mutex_enter(&lop->lo_lock);
1293	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1294	ASSERT(lop->lo_seqid_holder == curthread);
1295	lop->lo_flags &= ~NFS4_LOCK_SEQID_INUSE;
1296	lop->lo_seqid_holder = NULL;
1297	cv_signal(&lop->lo_cv_seqid_sync);
1298	mutex_exit(&lop->lo_lock);
1299}
1300
1301/*
1302 * Returns a reference to a lock owner via lopp, which has its lock seqid
1303 * synchronization started.
1304 * If the lock owner is in the 'just_created' state, then we return its open
1305 * owner and open stream and start the open seqid synchronization.
1306 *
1307 * Return value:
1308 * NFS4_OK		no problems
1309 * NFS4ERR_DELAY	there is lost state to recover; caller should retry
1310 * NFS4ERR_IO		no open stream
1311 */
1312nfsstat4
1313nfs4_find_or_create_lock_owner(pid_t pid, rnode4_t *rp, cred_t *cr,
1314    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
1315    nfs4_lock_owner_t **lopp)
1316{
1317	nfs4_lock_owner_t *lop, *next_lop;
1318	mntinfo4_t *mi;
1319	int error = 0;
1320	nfsstat4 stat;
1321
1322	mi = VTOMI4(RTOV4(rp));
1323
1324	mutex_enter(&rp->r_statev4_lock);
1325
1326	lop = rp->r_lo_head.lo_next_rnode;
1327	while (lop != &rp->r_lo_head) {
1328		mutex_enter(&lop->lo_lock);
1329		if (lop->lo_pid == pid && lop->lo_valid != 0) {
1330			/* Found a matching lock owner */
1331			NFS4_DEBUG(nfs4_client_state_debug,
1332			    (CE_NOTE, "nfs4_find_or_create_lock_owner: "
1333			    "got a match"));
1334			lop->lo_ref_count++;
1335			break;
1336		}
1337		next_lop = lop->lo_next_rnode;
1338		mutex_exit(&lop->lo_lock);
1339		lop = next_lop;
1340	}
1341
1342	if (lop == &rp->r_lo_head) {
1343		/* create temporary lock owner */
1344		lop = create_lock_owner(rp, pid);
1345	}
1346	mutex_exit(&rp->r_statev4_lock);
1347
1348	/* Have a locked down lock owner struct now */
1349	if (lop->lo_just_created != NFS4_JUST_CREATED) {
1350		/* This is an existing lock owner */
1351		*oopp = NULL;
1352		*ospp = NULL;
1353	} else {
1354		/* Lock owner doesn't exist yet */
1355
1356		/* First grab open owner seqid synchronization */
1357		mutex_exit(&lop->lo_lock);
1358		*oopp = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1359		if (*oopp == NULL)
1360			goto kill_new_lop;
1361		error = nfs4_start_open_seqid_sync(*oopp, mi);
1362		if (error == EAGAIN) {
1363			stat = NFS4ERR_DELAY;
1364			goto failed;
1365		}
1366		*ospp = find_open_stream(*oopp, rp);
1367		if (*ospp == NULL) {
1368			nfs4_end_open_seqid_sync(*oopp);
1369			goto kill_new_lop;
1370		}
1371		if ((*ospp)->os_failed_reopen) {
1372			mutex_exit(&(*ospp)->os_sync_lock);
1373			NFS4_DEBUG((nfs4_open_stream_debug ||
1374			    nfs4_client_lock_debug), (CE_NOTE,
1375			    "nfs4_find_or_create_lock_owner: os_failed_reopen;"
1376			    "osp %p, cr %p, rp %s", (void *)(*ospp),
1377			    (void *)cr, rnode4info(rp)));
1378			nfs4_end_open_seqid_sync(*oopp);
1379			stat = NFS4ERR_IO;
1380			goto failed;
1381		}
1382		mutex_exit(&(*ospp)->os_sync_lock);
1383
1384		/*
1385		 * Now see if the lock owner has become permanent while we
1386		 * had released our lock.
1387		 */
1388		mutex_enter(&lop->lo_lock);
1389		if (lop->lo_just_created != NFS4_JUST_CREATED) {
1390			nfs4_end_open_seqid_sync(*oopp);
1391			open_stream_rele(*ospp, rp);
1392			open_owner_rele(*oopp);
1393			*oopp = NULL;
1394			*ospp = NULL;
1395		}
1396	}
1397	mutex_exit(&lop->lo_lock);
1398
1399	error = nfs4_start_lock_seqid_sync(lop, mi);
1400	if (error == EAGAIN) {
1401		if (*oopp != NULL)
1402			nfs4_end_open_seqid_sync(*oopp);
1403		stat = NFS4ERR_DELAY;
1404		goto failed;
1405	}
1406	ASSERT(error == 0);
1407
1408	*lopp = lop;
1409	return (NFS4_OK);
1410
1411kill_new_lop:
1412	/*
1413	 * A previous CLOSE was attempted but got EINTR, but the application
1414	 * continued to use the unspecified state file descriptor.  But now the
1415	 * open stream is gone (which could also destroy the open owner), hence
1416	 * we can no longer continue.  The calling function should return EIO
1417	 * to the application.
1418	 */
1419	NFS4_DEBUG(nfs4_lost_rqst_debug || nfs4_client_lock_debug,
1420	    (CE_NOTE, "nfs4_find_or_create_lock_owner: destroy newly created "
1421	    "lop %p, oop %p, osp %p", (void *)lop, (void *)(*oopp),
1422	    (void *)(*ospp)));
1423
1424	nfs4_rnode_remove_lock_owner(rp, lop);
1425	stat = NFS4ERR_IO;
1426
1427failed:
1428	lock_owner_rele(lop);
1429	if (*oopp) {
1430		open_owner_rele(*oopp);
1431		*oopp = NULL;
1432	}
1433	if (*ospp) {
1434		open_stream_rele(*ospp, rp);
1435		*ospp = NULL;
1436	}
1437	return (stat);
1438}
1439
1440/*
1441 * This function grabs a recently freed open owner off of the freed open
1442 * owner list if there is a match on the cred 'cr'.  It returns NULL if no
1443 * such match is found.  It will set the 'oo_ref_count' and 'oo_valid' back
1444 * to both 1 (sane values) in the case a match is found.
1445 */
1446static nfs4_open_owner_t *
1447find_freed_open_owner(cred_t *cr, nfs4_oo_hash_bucket_t *bucketp,
1448    mntinfo4_t *mi)
1449{
1450	nfs4_open_owner_t		*foop;
1451
1452	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1453	    "find_freed_open_owner: cred %p", (void*)cr));
1454
1455	ASSERT(mutex_owned(&mi->mi_lock));
1456	ASSERT(mutex_owned(&bucketp->b_lock));
1457
1458	/* got hash bucket, search through freed open owners */
1459	for (foop = list_head(&mi->mi_foo_list); foop != NULL;
1460	    foop = list_next(&mi->mi_foo_list, foop)) {
1461		if (!crcmp(foop->oo_cred, cr)) {
1462			NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1463			    "find_freed_open_owner: got a match open owner "
1464			    "%p", (void *)foop));
1465			foop->oo_ref_count = 1;
1466			foop->oo_valid = 1;
1467			list_remove(&mi->mi_foo_list, foop);
1468			mi->mi_foo_num--;
1469
1470			/* now add the struct into the cred hash table */
1471			list_insert_head(&bucketp->b_oo_hash_list, foop);
1472			return (foop);
1473		}
1474	}
1475
1476	return (NULL);
1477}
1478
1479/*
1480 * Insert the newly freed 'oop' into the mi's freed oop list,
1481 * always at the head of the list.  If we've already reached
1482 * our maximum allowed number of freed open owners (mi_foo_max),
1483 * then remove the LRU open owner on the list (namely the tail).
1484 */
1485static void
1486nfs4_free_open_owner(nfs4_open_owner_t *oop, mntinfo4_t *mi)
1487{
1488	nfs4_open_owner_t *lru_foop;
1489
1490	if (mi->mi_foo_num < mi->mi_foo_max) {
1491		NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1492		    "nfs4_free_open_owner: num free %d, max free %d, "
1493		    "insert open owner %p for mntinfo4 %p",
1494		    mi->mi_foo_num, mi->mi_foo_max, (void *)oop,
1495		    (void *)mi));
1496		list_insert_head(&mi->mi_foo_list, oop);
1497		mi->mi_foo_num++;
1498		return;
1499	}
1500
1501	/* need to replace a freed open owner */
1502
1503	lru_foop = list_tail(&mi->mi_foo_list);
1504
1505	NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1506	    "nfs4_free_open_owner: destroy %p, insert %p",
1507	    (void *)lru_foop, (void *)oop));
1508
1509	list_remove(&mi->mi_foo_list, lru_foop);
1510	nfs4_destroy_open_owner(lru_foop);
1511
1512	/* head always has latest freed oop */
1513	list_insert_head(&mi->mi_foo_list, oop);
1514}
1515
1516void
1517nfs4_destroy_open_owner(nfs4_open_owner_t *oop)
1518{
1519	ASSERT(oop != NULL);
1520
1521	crfree(oop->oo_cred);
1522	if (oop->oo_cred_otw)
1523		crfree(oop->oo_cred_otw);
1524	mutex_destroy(&oop->oo_lock);
1525	cv_destroy(&oop->oo_cv_seqid_sync);
1526	kmem_free(oop, sizeof (*oop));
1527}
1528
1529seqid4
1530nfs4_get_open_seqid(nfs4_open_owner_t *oop)
1531{
1532	ASSERT(oop->oo_seqid_inuse);
1533	return (oop->oo_seqid);
1534}
1535
1536/*
1537 * This set's the open seqid for a <open owner/ mntinfo4> pair.
1538 */
1539void
1540nfs4_set_open_seqid(seqid4 seqid, nfs4_open_owner_t *oop,
1541    nfs4_tag_type_t tag_type)
1542{
1543	ASSERT(oop->oo_seqid_inuse);
1544	oop->oo_seqid = seqid;
1545	oop->oo_last_good_seqid = seqid;
1546	oop->oo_last_good_op = tag_type;
1547}
1548
1549/*
1550 * This bumps the current open seqid for the open owner 'oop'.
1551 */
1552void
1553nfs4_get_and_set_next_open_seqid(nfs4_open_owner_t *oop,
1554    nfs4_tag_type_t tag_type)
1555{
1556	ASSERT(oop->oo_seqid_inuse);
1557	oop->oo_seqid++;
1558	oop->oo_last_good_seqid = oop->oo_seqid;
1559	oop->oo_last_good_op = tag_type;
1560}
1561
1562/*
1563 * If no open owner was provided, this function takes the cred to find an
1564 * open owner within the given mntinfo4_t.  Either way we return the
1565 * open owner's OTW credential if it exists; otherwise returns the
1566 * supplied 'cr'.
1567 *
1568 * A hold is put on the returned credential, and it is up to the caller
1569 * to free the cred.
1570 */
1571cred_t *
1572nfs4_get_otw_cred(cred_t *cr, mntinfo4_t *mi, nfs4_open_owner_t *provided_oop)
1573{
1574	cred_t *ret_cr;
1575	nfs4_open_owner_t *oop = provided_oop;
1576
1577	if (oop == NULL)
1578		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1579	if (oop != NULL) {
1580		mutex_enter(&oop->oo_lock);
1581		if (oop->oo_cred_otw)
1582			ret_cr = oop->oo_cred_otw;
1583		else
1584			ret_cr = cr;
1585		crhold(ret_cr);
1586		mutex_exit(&oop->oo_lock);
1587		if (provided_oop == NULL)
1588			open_owner_rele(oop);
1589	} else {
1590		ret_cr = cr;
1591		crhold(ret_cr);
1592	}
1593	return (ret_cr);
1594}
1595
1596/*
1597 * Retrieves the next open stream in the rnode's list if an open stream
1598 * is provided; otherwise gets the first open stream in the list.
1599 * The open owner for that open stream is then retrieved, and if its
1600 * oo_cred_otw exists then it is returned; otherwise the provided 'cr'
1601 * is returned.  *osp is set to the 'found' open stream.
1602 *
1603 * Note: we don't set *osp to the open stream retrieved via the
1604 * optimized check since that won't necessarily be at the beginning
1605 * of the rnode list, and if that osp doesn't work we'd like to
1606 * check _all_ open streams (starting from the beginning of the
1607 * rnode list).
1608 */
1609cred_t *
1610nfs4_get_otw_cred_by_osp(rnode4_t *rp, cred_t *cr,
1611    nfs4_open_stream_t **osp, bool_t *first_time, bool_t *last_time)
1612{
1613	nfs4_open_stream_t *next_osp = NULL;
1614	cred_t *ret_cr;
1615
1616	ASSERT(cr != NULL);
1617	/*
1618	 * As an optimization, try to find the open owner
1619	 * for the cred provided since that's most likely
1620	 * to work.
1621	 */
1622	if (*first_time) {
1623		nfs4_open_owner_t *oop;
1624
1625		oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(RTOV4(rp)));
1626		if (oop) {
1627			next_osp = find_open_stream(oop, rp);
1628			if (next_osp)
1629				mutex_exit(&next_osp->os_sync_lock);
1630			open_owner_rele(oop);
1631		}
1632	}
1633	if (next_osp == NULL) {
1634		int delay_rele = 0;
1635		*first_time = FALSE;
1636
1637		/* return the next open stream for this rnode */
1638		mutex_enter(&rp->r_os_lock);
1639		/* Now, no one can add or delete to rp's open streams list */
1640
1641		if (*osp) {
1642			next_osp = list_next(&rp->r_open_streams, *osp);
1643			/*
1644			 * Delay the rele of *osp until after we drop
1645			 * r_os_lock to not deadlock with oo_lock
1646			 * via an open_stream_rele()->open_owner_rele().
1647			 */
1648			delay_rele = 1;
1649		} else {
1650			next_osp = list_head(&rp->r_open_streams);
1651		}
1652		if (next_osp) {
1653			nfs4_open_stream_t *tmp_osp;
1654
1655			/* find the next valid open stream */
1656			mutex_enter(&next_osp->os_sync_lock);
1657			while (next_osp && !next_osp->os_valid) {
1658				tmp_osp =
1659				    list_next(&rp->r_open_streams, next_osp);
1660				mutex_exit(&next_osp->os_sync_lock);
1661				next_osp = tmp_osp;
1662				if (next_osp)
1663					mutex_enter(&next_osp->os_sync_lock);
1664			}
1665			if (next_osp) {
1666				next_osp->os_ref_count++;
1667				mutex_exit(&next_osp->os_sync_lock);
1668			}
1669		}
1670		mutex_exit(&rp->r_os_lock);
1671		if (delay_rele)
1672			open_stream_rele(*osp, rp);
1673	}
1674
1675	if (next_osp) {
1676		nfs4_open_owner_t *oop;
1677
1678		oop = next_osp->os_open_owner;
1679		mutex_enter(&oop->oo_lock);
1680		if (oop->oo_cred_otw)
1681			ret_cr = oop->oo_cred_otw;
1682		else
1683			ret_cr = cr;
1684		crhold(ret_cr);
1685		mutex_exit(&oop->oo_lock);
1686		if (*first_time) {
1687			open_stream_rele(next_osp, rp);
1688			*osp = NULL;
1689		} else
1690			*osp = next_osp;
1691	} else {
1692		/* just return the cred provided to us */
1693		*last_time = TRUE;
1694		*osp = NULL;
1695		ret_cr = cr;
1696		crhold(ret_cr);
1697	}
1698
1699	*first_time = FALSE;
1700	return (ret_cr);
1701}
1702
1703void
1704nfs4_init_stateid_types(nfs4_stateid_types_t *sid_tp)
1705{
1706	bzero(&sid_tp->d_sid, sizeof (stateid4));
1707	bzero(&sid_tp->l_sid, sizeof (stateid4));
1708	bzero(&sid_tp->o_sid, sizeof (stateid4));
1709	sid_tp->cur_sid_type = NO_SID;
1710}
1711
1712void
1713nfs4_save_stateid(stateid4 *s1, nfs4_stateid_types_t *sid_tp)
1714{
1715	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1716	    "nfs4_save_stateid: saved %s stateid",
1717	    sid_tp->cur_sid_type == DEL_SID ? "delegation" :
1718	    sid_tp->cur_sid_type == LOCK_SID ? "lock" :
1719	    sid_tp->cur_sid_type == OPEN_SID ? "open" : "special"));
1720
1721	switch (sid_tp->cur_sid_type) {
1722	case DEL_SID:
1723		sid_tp->d_sid = *s1;
1724		break;
1725	case LOCK_SID:
1726		sid_tp->l_sid = *s1;
1727		break;
1728	case OPEN_SID:
1729		sid_tp->o_sid = *s1;
1730		break;
1731	case SPEC_SID:
1732	default:
1733		cmn_err(CE_PANIC, "nfs4_save_stateid: illegal "
1734		    "stateid type %d", sid_tp->cur_sid_type);
1735	}
1736}
1737
1738/*
1739 * We got NFS4ERR_BAD_SEQID.  Setup some arguments to pass to recovery.
1740 * Caller is responsible for freeing.
1741 */
1742nfs4_bseqid_entry_t *
1743nfs4_create_bseqid_entry(nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop,
1744    vnode_t *vp, pid_t pid, nfs4_tag_type_t tag, seqid4 seqid)
1745{
1746	nfs4_bseqid_entry_t	*bsep;
1747
1748	bsep = kmem_alloc(sizeof (*bsep), KM_SLEEP);
1749	bsep->bs_oop = oop;
1750	bsep->bs_lop = lop;
1751	bsep->bs_vp = vp;
1752	bsep->bs_pid = pid;
1753	bsep->bs_tag = tag;
1754	bsep->bs_seqid = seqid;
1755
1756	return (bsep);
1757}
1758
1759void
1760nfs4open_dg_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1761    nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
1762    vnode_t *vp, int access_close, int deny_close)
1763{
1764	lost_rqstp->lr_putfirst = FALSE;
1765
1766	ASSERT(vp != NULL);
1767	if (error == ETIMEDOUT || error == EINTR ||
1768	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1769		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1770		    "nfs4open_dg_save_lost_rqst: error %d", error));
1771
1772		lost_rqstp->lr_op = OP_OPEN_DOWNGRADE;
1773		/*
1774		 * The vp is held and rele'd via the recovery code.
1775		 * See nfs4_save_lost_rqst.
1776		 */
1777		lost_rqstp->lr_vp = vp;
1778		lost_rqstp->lr_dvp = NULL;
1779		lost_rqstp->lr_oop = oop;
1780		lost_rqstp->lr_osp = osp;
1781		lost_rqstp->lr_lop = NULL;
1782		lost_rqstp->lr_cr = cr;
1783		lost_rqstp->lr_flk = NULL;
1784		lost_rqstp->lr_dg_acc = access_close;
1785		lost_rqstp->lr_dg_deny = deny_close;
1786		lost_rqstp->lr_putfirst = FALSE;
1787	} else {
1788		lost_rqstp->lr_op = 0;
1789	}
1790}
1791
1792/*
1793 * Change the access and deny bits of an OPEN.
1794 * If recovery is needed, *recov_credpp is set to the cred used OTW,
1795 * a hold is placed on it, and *recov_seqidp is set to the seqid used OTW.
1796 */
1797void
1798nfs4_open_downgrade(int access_close, int deny_close, nfs4_open_owner_t *oop,
1799    nfs4_open_stream_t *osp, vnode_t *vp, cred_t *cr, nfs4_lost_rqst_t *lrp,
1800    nfs4_error_t *ep, cred_t **recov_credpp, seqid4 *recov_seqidp)
1801{
1802	mntinfo4_t		*mi;
1803	int			downgrade_acc, downgrade_deny;
1804	int			new_acc, new_deny;
1805	COMPOUND4args_clnt	args;
1806	COMPOUND4res_clnt	res;
1807	OPEN_DOWNGRADE4res	*odg_res;
1808	nfs_argop4		argop[3];
1809	nfs_resop4		*resop;
1810	rnode4_t		*rp;
1811	bool_t			needrecov = FALSE;
1812	int			doqueue = 1;
1813	seqid4			seqid = 0;
1814	cred_t			*cred_otw;
1815	hrtime_t		t;
1816
1817	ASSERT(mutex_owned(&osp->os_sync_lock));
1818#if DEBUG
1819	mutex_enter(&oop->oo_lock);
1820	ASSERT(oop->oo_seqid_inuse);
1821	mutex_exit(&oop->oo_lock);
1822#endif
1823
1824
1825	if (access_close == 0 && deny_close == 0) {
1826		nfs4_error_zinit(ep);
1827		return;
1828	}
1829
1830	cred_otw = nfs4_get_otw_cred(cr, VTOMI4(vp), oop);
1831
1832cred_retry:
1833	nfs4_error_zinit(ep);
1834	downgrade_acc = 0;
1835	downgrade_deny = 0;
1836	mi = VTOMI4(vp);
1837	rp = VTOR4(vp);
1838
1839	/*
1840	 * Check to see if the open stream got closed before we go OTW,
1841	 * now that we have acquired the 'os_sync_lock'.
1842	 */
1843	if (!osp->os_valid) {
1844		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1845		    " open stream has already been closed, return success"));
1846		/* error has already been set */
1847		goto no_args_out;
1848	}
1849
1850	/* If the file failed recovery, just quit. */
1851	mutex_enter(&rp->r_statelock);
1852	if (rp->r_flags & R4RECOVERR) {
1853		mutex_exit(&rp->r_statelock);
1854		ep->error = EIO;
1855		goto no_args_out;
1856	}
1857	mutex_exit(&rp->r_statelock);
1858
1859	seqid = nfs4_get_open_seqid(oop) + 1;
1860
1861	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1862	    "access_close %d, acc_read %"PRIu64" acc_write %"PRIu64"",
1863	    access_close, osp->os_share_acc_read, osp->os_share_acc_write));
1864
1865	/* If we're closing the last READ, need to downgrade */
1866	if ((access_close & FREAD) && (osp->os_share_acc_read == 1))
1867		downgrade_acc |= OPEN4_SHARE_ACCESS_READ;
1868
1869	/* if we're closing the last WRITE, need to downgrade */
1870	if ((access_close & FWRITE) && (osp->os_share_acc_write == 1))
1871		downgrade_acc |= OPEN4_SHARE_ACCESS_WRITE;
1872
1873	downgrade_deny = OPEN4_SHARE_DENY_NONE;
1874
1875	new_acc = 0;
1876	new_deny = 0;
1877
1878	/* set our new access and deny share bits */
1879	if ((osp->os_share_acc_read > 0) &&
1880	    !(downgrade_acc & OPEN4_SHARE_ACCESS_READ))
1881		new_acc |= OPEN4_SHARE_ACCESS_READ;
1882	if ((osp->os_share_acc_write > 0) &&
1883	    !(downgrade_acc & OPEN4_SHARE_ACCESS_WRITE))
1884		new_acc |= OPEN4_SHARE_ACCESS_WRITE;
1885
1886	new_deny = OPEN4_SHARE_DENY_NONE;
1887
1888	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1889	    "downgrade acc 0x%x deny 0x%x", downgrade_acc, downgrade_deny));
1890	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1891	    "new acc 0x%x deny 0x%x", new_acc, new_deny));
1892
1893	/*
1894	 * Check to see if we aren't actually doing any downgrade or
1895	 * if this is the last 'close' but the file is still mmapped.
1896	 * Skip this if this a lost request resend so we don't decrement
1897	 * the osp's share counts more than once.
1898	 */
1899	if (!lrp &&
1900	    ((downgrade_acc == 0 && downgrade_deny == 0) ||
1901	    (new_acc == 0 && new_deny == 0))) {
1902		/*
1903		 * No downgrade to do, but still need to
1904		 * update osp's os_share_* counts.
1905		 */
1906		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE,
1907		    "nfs4_open_downgrade: just lower the osp's count by %s",
1908		    (access_close & FREAD) && (access_close & FWRITE) ?
1909		    "read and write" : (access_close & FREAD) ? "read" :
1910		    (access_close & FWRITE) ? "write" : "bogus"));
1911		if (access_close & FREAD)
1912			osp->os_share_acc_read--;
1913		if (access_close & FWRITE)
1914			osp->os_share_acc_write--;
1915		osp->os_share_deny_none--;
1916		nfs4_error_zinit(ep);
1917
1918		goto no_args_out;
1919	}
1920
1921	if (osp->os_orig_oo_name != oop->oo_name) {
1922		ep->error = EIO;
1923		goto no_args_out;
1924	}
1925
1926	/* setup the COMPOUND args */
1927	if (lrp)
1928		args.ctag = TAG_OPEN_DG_LOST;
1929	else
1930		args.ctag = TAG_OPEN_DG;
1931
1932	args.array_len = 3;
1933	args.array = argop;
1934
1935	/* putfh */
1936	argop[0].argop = OP_CPUTFH;
1937	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1938
1939	argop[1].argop = OP_GETATTR;
1940	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1941	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1942
1943	ASSERT(mutex_owned(&osp->os_sync_lock));
1944	ASSERT(osp->os_delegation == FALSE);
1945
1946	/* open downgrade */
1947	argop[2].argop = OP_OPEN_DOWNGRADE;
1948	argop[2].nfs_argop4_u.opopen_downgrade.open_stateid = osp->open_stateid;
1949	argop[2].nfs_argop4_u.opopen_downgrade.share_access = new_acc;
1950	argop[2].nfs_argop4_u.opopen_downgrade.share_deny = new_deny;
1951	argop[2].nfs_argop4_u.opopen_downgrade.seqid = seqid;
1952
1953	t = gethrtime();
1954
1955	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1956
1957	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
1958		nfs4_set_open_seqid(seqid, oop, args.ctag);
1959
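	/*
	 * If the credential chosen for the OTW call was rejected with an
	 * access error, fall back to the caller's credential and retry the
	 * compound; once cred_otw == cr, no further retry is attempted.
	 */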
1960	if ((ep->error == EACCES ||
1961	    (ep->error == 0 && res.status == NFS4ERR_ACCESS)) &&
1962	    cred_otw != cr) {
1963		crfree(cred_otw);
1964		cred_otw = cr;
1965		crhold(cred_otw);
1966		if (!ep->error)
1967			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1968		goto cred_retry;
1969	}
1970
1971	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
1972
1973	if (needrecov && recov_credpp) {
1974		*recov_credpp = cred_otw;
1975		crhold(*recov_credpp);
1976		if (recov_seqidp)
1977			*recov_seqidp = seqid;
1978	}
1979
1980	if (!ep->error && !res.status) {
1981		/* get the open downgrade results */
1982		resop = &res.array[2];
1983		odg_res = &resop->nfs_resop4_u.opopen_downgrade;
1984
1985		osp->open_stateid = odg_res->open_stateid;
1986
1987		/* update the open stream's share counts and new access bits */
1988		if (access_close & FREAD)
1989			osp->os_share_acc_read--;
1990		if (access_close & FWRITE)
1991			osp->os_share_acc_write--;
1992		osp->os_share_deny_none--;
1993		osp->os_dc_openacc = new_acc;
1994
1995		nfs4_attr_cache(vp,
1996		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
1997		    t, cred_otw, TRUE, NULL);
1998	}
1999
2000	if (!ep->error)
2001		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2002
2003no_args_out:
2004	crfree(cred_otw);
2005}
2006
2007/*
2008 * If an OPEN request gets ETIMEDOUT or EINTR (which includes bailing out
2009 * because the filesystem was forcibly unmounted), we don't know whether we
2010 * left state dangling on the server, so the recovery framework makes this
2011 * call to resend the OPEN request and then undo it.
2012 */
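/*
 * A rough sketch of the compound built below (see the argop setup):
 *	op 0: PUTFH   - the parent directory fh, or the file's stored
 *			parent fh when this is a reopen (NULL lr_dvp)
 *	op 1: OPEN    - always OPEN4_NOCREATE; claim is CLAIM_NULL or
 *			CLAIM_DELEGATE_CUR
 *	op 2: GETFH
 *	op 3: GETATTR
 */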
2013void
2014nfs4_resend_open_otw(vnode_t **vpp, nfs4_lost_rqst_t *resend_rqstp,
2015    nfs4_error_t *ep)
2016{
2017	COMPOUND4args_clnt	args;
2018	COMPOUND4res_clnt	res;
2019	nfs_argop4		argop[4];
2020	GETFH4res		*gf_res = NULL;
2021	OPEN4cargs		*open_args;
2022	OPEN4res		*op_res;
2023	char			*destcfp;
2024	int			destclen;
2025	nfs4_ga_res_t		*garp;
2026	vnode_t			*dvp = NULL, *vp = NULL;
2027	rnode4_t		*rp = NULL, *drp = NULL;
2028	cred_t			*cr = NULL;
2029	seqid4			seqid;
2030	nfs4_open_owner_t	*oop = NULL;
2031	nfs4_open_stream_t	*osp = NULL;
2032	component4		*srcfp;
2033	open_claim_type4	claim;
2034	mntinfo4_t		*mi;
2035	int			doqueue = 1;
2036	bool_t			retry_open = FALSE;
2037	int			created_osp = 0;
2038	hrtime_t		t;
2039	char			*failed_msg = "";
2040	int			fh_different;
2041	int			reopen = 0;
2042
2043	nfs4_error_zinit(ep);
2044
2045	cr = resend_rqstp->lr_cr;
2046	dvp = resend_rqstp->lr_dvp;
2047
2048	vp = *vpp;
2049	if (vp) {
2050		ASSERT(nfs4_consistent_type(vp));
2051		rp = VTOR4(vp);
2052	}
2053
2054	if (rp) {
2055		/* If the file failed recovery, just quit. */
2056		mutex_enter(&rp->r_statelock);
2057		if (rp->r_flags & R4RECOVERR) {
2058			mutex_exit(&rp->r_statelock);
2059			ep->error = EIO;
2060			return;
2061		}
2062		mutex_exit(&rp->r_statelock);
2063	}
2064
2065	if (dvp) {
2066		drp = VTOR4(dvp);
2067		/* If the parent directory failed recovery, just quit. */
2068		mutex_enter(&drp->r_statelock);
2069		if (drp->r_flags & R4RECOVERR) {
2070			mutex_exit(&drp->r_statelock);
2071			ep->error = EIO;
2072			return;
2073		}
2074		mutex_exit(&drp->r_statelock);
2075	} else
2076		reopen = 1;	/* NULL dvp means this is a reopen */
2077
2078	claim = resend_rqstp->lr_oclaim;
2079	ASSERT(claim == CLAIM_NULL || claim == CLAIM_DELEGATE_CUR);
2080
2081	args.ctag = TAG_OPEN_LOST;
2082	args.array_len = 4;
2083	args.array = argop;
2084
2085	argop[0].argop = OP_CPUTFH;
2086	if (reopen) {
2087		ASSERT(vp != NULL);
2088
2089		mi = VTOMI4(vp);
2090		/*
2091		 * If this vnode is the root of the mount (e.g. a file mount), use
2092		 * the mntinfo parent fh; otherwise use the saved directory fh.
2093		 */
2094		argop[0].nfs_argop4_u.opcputfh.sfh =
2095		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
2096		    VTOSV(vp)->sv_dfh;
2097		args.ctag = TAG_REOPEN_LOST;
2098	} else {
2099		argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
2100		mi = VTOMI4(dvp);
2101	}
2102
2103	argop[1].argop = OP_COPEN;
2104	open_args = &argop[1].nfs_argop4_u.opcopen;
2105	open_args->claim = claim;
2106
2107	/*
2108	 * If we sent over an OPEN with CREATE then the only
2109	 * thing we care about is to not leave dangling state
2110	 * on the server, not whether the file we potentially
2111	 * created remains on the server.  So even though the
2112	 * lost open request specified a CREATE, we only wish
2113	 * to do a non-CREATE OPEN.
2114	 */
2115	open_args->opentype = OPEN4_NOCREATE;
2116
2117	srcfp = &resend_rqstp->lr_ofile;
2118	destclen = srcfp->utf8string_len;
2119	destcfp = kmem_alloc(destclen + 1, KM_SLEEP);
2120	bcopy(srcfp->utf8string_val, destcfp, destclen);
2121	destcfp[destclen] = '\0';
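	/*
	 * The copied file name goes into a claim-specific slot of the OPEN
	 * args; for CLAIM_DELEGATE_CUR the current delegation stateid
	 * (lr_ostateid) is passed along with it.
	 */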
2122	if (claim == CLAIM_DELEGATE_CUR) {
2123		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
2124		    resend_rqstp->lr_ostateid;
2125		open_args->open_claim4_u.delegate_cur_info.cfile = destcfp;
2126	} else {
2127		open_args->open_claim4_u.cfile = destcfp;
2128	}
2129
2130	open_args->share_access = resend_rqstp->lr_oacc;
2131	open_args->share_deny = resend_rqstp->lr_odeny;
2132	oop = resend_rqstp->lr_oop;
2133	ASSERT(oop != NULL);
2134
2135	open_args->owner.clientid = mi2clientid(mi);
2136	/* this length never changes */
2137	open_args->owner.owner_len = sizeof (oop->oo_name);
2138	open_args->owner.owner_val =
2139	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
2140
2141	ep->error = nfs4_start_open_seqid_sync(oop, mi);
2142	ASSERT(ep->error == 0);		/* recov thread always succeeds */
2143	/*
2144	 * We can get away with not saving the seqid when the lost request
2145	 * is detected; instead we just use the open owner's current
2146	 * seqid now, since we only allow one op OTW per seqid and lost
2147	 * requests are saved FIFO.
2148	 */
2149	seqid = nfs4_get_open_seqid(oop) + 1;
2150	open_args->seqid = seqid;
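	/*
	 * Illustrative example: if the open owner's last recorded open
	 * seqid is 7, this resend goes over the wire with seqid 8; the new
	 * value is recorded via nfs4_set_open_seqid() below only when
	 * nfs4_need_to_bump_seqid() says the server actually consumed it.
	 */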
2151
2152	bcopy(&oop->oo_name, open_args->owner.owner_val,
2153	    open_args->owner.owner_len);
2154
2155	/* getfh */
2156	argop[2].argop = OP_GETFH;
2157
2158	/* Construct the getattr part of the compound */
2159	argop[3].argop = OP_GETATTR;
2160	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2161	argop[3].nfs_argop4_u.opgetattr.mi = mi;
2162
2163	res.array = NULL;
2164
2165	t = gethrtime();
2166
2167	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
2168
2169	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
2170		nfs4_set_open_seqid(seqid, oop, args.ctag);
2171
2172	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2173	    "nfs4_resend_open_otw: error %d stat %d", ep->error, res.status));
2174
2175	if (ep->error || res.status)
2176		goto err_out;
2177
2178	op_res = &res.array[1].nfs_resop4_u.opopen;
2179	gf_res = &res.array[2].nfs_resop4_u.opgetfh;
2180	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2181
2182	if (!vp) {
2183		int rnode_err = 0;
2184		nfs4_sharedfh_t *sfh;
2185
2186		/*
2187		 * Even if we can't decode all the attributes (they won't be usable),
2188		 * still make the vnode; the attr cache is purged below on error.
2189		 */
2190
2191		sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
2192		*vpp = makenfs4node(sfh, garp, dvp->v_vfsp, t, cr, dvp,
2193		    fn_get(VTOSV(dvp)->sv_name,
2194		    open_args->open_claim4_u.cfile, sfh));
2195		sfh4_rele(&sfh);
2196		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2197		    "nfs4_resend_open_otw: made vp %p for file %s",
2198		    (void *)(*vpp), open_args->open_claim4_u.cfile));
2199
2200		if (ep->error)
2201			PURGE_ATTRCACHE4(*vpp);
2202
2203		/*
2204		 * For the newly created *vpp case, make sure the rnode
2205		 * isn't bad before using it.
2206		 */
2207		mutex_enter(&(VTOR4(*vpp))->r_statelock);
2208		if (VTOR4(*vpp)->r_flags & R4RECOVERR)
2209			rnode_err = EIO;
2210		mutex_exit(&(VTOR4(*vpp))->r_statelock);
2211
2212		if (rnode_err) {
2213			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2214			    "nfs4_resend_open_otw: rp %p is bad",
2215			    (void *)VTOR4(*vpp)));
2216			ep->error = rnode_err;
2217			goto err_out;
2218		}
2219
2220		vp = *vpp;
2221		rp = VTOR4(vp);
2222	}
2223
2224	if (reopen) {
2225		/*
2226		 * Check if the path we reopened really is the same
2227		 * file. We could end up in a situation where the file
2228		 * was removed and a new file created with the same name.
2229		 */
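		/*
		 * Roughly: a persistent or no-expire-on-open handle that
		 * differs means a different file and recovery fails; a
		 * volatile handle that differs is compared by fid, and only
		 * a fid mismatch is fatal; otherwise the new handle is
		 * adopted via sfh4_update().
		 */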
2230		(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2231		fh_different =
2232		    (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2233		if (fh_different) {
2234			if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2235			    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2236				/* Oops, we don't have the same file */
2237				if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2238					failed_msg =
2239					    "Couldn't reopen: Persistent "
2240					    "file handle changed";
2241				else
2242					failed_msg =
2243					    "Couldn't reopen: Volatile "
2244					    "(no expire on open) file handle "
2245					    "changed";
2246
2247				nfs4_end_open_seqid_sync(oop);
2248				kmem_free(destcfp, destclen + 1);
2249				nfs4args_copen_free(open_args);
2250				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2251				nfs_rw_exit(&mi->mi_fh_lock);
2252				nfs4_fail_recov(vp, failed_msg, ep->error,
2253				    ep->stat);
2254				return;
2255			} else {
2256				/*
2257				 * We have volatile file handles that don't
2258				 * compare.  If the fids are the same then we
2259				 * assume that the file handle expired but the
2260				 * rnode still refers to the same file object.
2261				 *
2262				 * First check whether we have fids at all.
2263				 * If we don't, we have a dumb server so we will
2264				 * just assume everything is OK for now.
2265				 */
2266				if (!ep->error &&
2267				    garp->n4g_va.va_mask & AT_NODEID &&
2268				    rp->r_attr.va_mask & AT_NODEID &&
2269				    rp->r_attr.va_nodeid !=
2270				    garp->n4g_va.va_nodeid) {
2271					/*
2272					 * We have fids, but they don't
2273					 * compare. So kill the file.
2274					 */
2275					failed_msg =
2276					    "Couldn't reopen: file handle "
2277					    "changed due to mismatched fids";
2278					nfs4_end_open_seqid_sync(oop);
2279					kmem_free(destcfp, destclen + 1);
2280					nfs4args_copen_free(open_args);
2281					xdr_free(xdr_COMPOUND4res_clnt,
2282					    (caddr_t)&res);
2283					nfs_rw_exit(&mi->mi_fh_lock);
2284					nfs4_fail_recov(vp, failed_msg,
2285					    ep->error, ep->stat);
2286					return;
2287				} else {
2288					/*
2289					 * We have volatile file handles that
2290					 * refer to the same file (at least
2291					 * they have the same fid), or we don't
2292					 * have fids so we can't tell.  We'll
2293					 * be a kind and accepting client so
2294					 * we'll update the rnode's file
2295					 * handle with the otw handle.
2296					 *
2297					 * We need to drop mi->mi_fh_lock since
2298					 * sfh4_update() acquires it. Since there
2299					 * is only one recovery thread there is
2300					 * no race.
2301					 */
2302					nfs_rw_exit(&mi->mi_fh_lock);
2303					sfh4_update(rp->r_fh, &gf_res->object);
2304				}
2305			}
2306		} else {
2307			nfs_rw_exit(&mi->mi_fh_lock);
2308		}
2309	}
2310
2311	ASSERT(nfs4_consistent_type(vp));
2312
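	/*
	 * The server may have demanded an explicit OPEN_CONFIRM before the
	 * new open stateid is usable; nfs4open_confirm() performs that
	 * exchange, presumably advancing the seqid it is passed by address.
	 */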
2313	if (op_res->rflags & OPEN4_RESULT_CONFIRM)
2314		nfs4open_confirm(vp, &seqid, &op_res->stateid, cr, TRUE,
2315		    &retry_open, oop, TRUE, ep, NULL);
2316	if (ep->error || ep->stat) {
2317		nfs4_end_open_seqid_sync(oop);
2318		kmem_free(destcfp, destclen + 1);
2319		nfs4args_copen_free(open_args);
2320		if (!ep->error)
2321			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2322		return;
2323	}
2324
2325	if (reopen) {
2326		/*
2327		 * Doing a reopen here so the osp should already exist.
2328		 * If not, something changed or went very wrong.
2329		 *
2330		 * returns with 'os_sync_lock' held
2331		 */
2332		osp = find_open_stream(oop, rp);
2333		if (!osp) {
2334			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2335			    "nfs4_resend_open_otw: couldn't find osp"));
2336			ep->error = EINVAL;
2337			goto err_out;
2338		}
2339		osp->os_open_ref_count++;
2340	} else {
2341		mutex_enter(&oop->oo_lock);
2342		oop->oo_just_created = NFS4_PERM_CREATED;
2343		mutex_exit(&oop->oo_lock);
2344
2345		/* returns with 'os_sync_lock' held */
2346		osp = find_or_create_open_stream(oop, rp, &created_osp);
2347		if (!osp) {
2348			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2349			    "nfs4_resend_open_otw: couldn't create osp"));
2350			ep->error = EINVAL;
2351			goto err_out;
2352		}
2353	}
2354
2355	osp->open_stateid = op_res->stateid;
2356	osp->os_delegation = FALSE;
2357	/*
2358	 * Need to reset this bitfield for the possible case where we were
2359	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2360	 * we could retry the CLOSE, OPENed the file again.
2361	 */
2362	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2363	osp->os_final_close = 0;
2364	osp->os_force_close = 0;
2365
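	/*
	 * A freshly created (non-reopen) open stream gets its share counts
	 * bumped here so that later downgrades and closes, which decrement
	 * these counts (as nfs4_open_downgrade() does above), stay
	 * balanced; a reopen reuses the counts already on the stream.
	 */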
2366	if (!reopen) {
2367		if (open_args->share_access & OPEN4_SHARE_ACCESS_READ)
2368			osp->os_share_acc_read++;
2369		if (open_args->share_access & OPEN4_SHARE_ACCESS_WRITE)
2370			osp->os_share_acc_write++;
2371		osp->os_share_deny_none++;
2372	}
2373
2374	mutex_exit(&osp->os_sync_lock);
2375	if (created_osp)
2376		nfs4_inc_state_ref_count(mi);
2377	open_stream_rele(osp, rp);
2378
2379	nfs4_end_open_seqid_sync(oop);
2380
2381	/* accept delegation, if any */
2382	nfs4_delegation_accept(rp, claim, op_res, garp, cr);
2383
2384	kmem_free(destcfp, destclen + 1);
2385	nfs4args_copen_free(open_args);
2386
2387	if (claim == CLAIM_DELEGATE_CUR)
2388		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2389	else
2390		PURGE_ATTRCACHE4(vp);
2391
2392	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2393
2394	ASSERT(nfs4_consistent_type(vp));
2395
2396	return;
2397
2398err_out:
2399	nfs4_end_open_seqid_sync(oop);
2400	kmem_free(destcfp, destclen + 1);
2401	nfs4args_copen_free(open_args);
2402	if (!ep->error)
2403		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2404}
2405